Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12266:48d6738904a9 libavcodec
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
splits it into small optimization-specific macros which are selected for each
DSP function. The advantage of this approach is that the sse4 functions now
use the ssse3 codepath also without needing an explicit sse4 codepath.
author | rbultje |
---|---|
date | Sat, 24 Jul 2010 19:33:05 +0000 |
parents | c7f6ddcc5c01 |
children | 259988e7ad0f |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
148 cextern pb_1 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
149 cextern pw_3 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
150 cextern pb_3 |
11975 | 151 cextern pw_4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
152 cextern pb_4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
153 cextern pw_9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
154 cextern pw_18 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
155 cextern pw_27 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
156 cextern pw_63 |
11975 | 157 cextern pw_64 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
158 cextern pb_80 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
159 cextern pb_F8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
160 cextern pb_FE |
11975 | 161 |
162 SECTION .text | |
163 | |
164 ;----------------------------------------------------------------------------- | |
165 ; subpel MC functions: | |
166 ; | |
167 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
168 ; uint8_t *src, int srcstride, | |
169 ; int height, int mx, int my); | |
170 ;----------------------------------------------------------------------------- | |
171 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
172 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
173 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
174 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
175 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
331 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 |
11975 | 337 ; 4x4 block, H-only 4-tap filter |
; put_vp8_epel4_h4_mmxext: horizontal-only 4-tap subpel filter that writes
; 4 output pixels per row (one movd store per loop iteration).
; Register roles, following the put_vp8_epel prototype given at the top of the
; .text section (x86inc numbers the arguments r0, r1, ...):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = height (row counter), r5 = mx (filter selector)
; Only 6 arguments are requested from cglobal, so `my` is not loaded here.
; MMX register roles inside the loop:
;   mm4 = tap pair F0/F1, mm5 = tap pair F2/F3 (as words),
;   mm6 = zero (for byte->word unpacking), mm7 = pw_64 (rounding term)
338 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
; r5 = mx*16; each table row ("times 4 dw a, b") is 16 bytes, and the two loads
; below read the rows at byte offsets r5-16 and r5.
; NOTE(review): this indexing lands on a matching F0/F1 + F2/F3 row pair only
; for odd mx -- presumably the caller guarantees that; confirm.
339 shl r5d, 4 | |
340 %ifdef PIC | |
; under PIC, fourtap_filter_hw is %defined to r11, so load the table base there
341 lea r11, [fourtap_filter_hw_m] | |
342 %endif | |
343 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
344 movq mm5, [fourtap_filter_hw+r5] | |
345 movq mm7, [pw_64] | |
346 pxor mm6, mm6 | |
347 | |
348 .nextrow | |
349 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
350 | |
351 ; first set of 2 pixels | |
352 movq mm2, mm1 ; byte ABCD.. | |
353 punpcklbw mm1, mm6 ; byte->word ABCD | |
; imm 9 selects source words 1,2,0,0, i.e. shifts the bytes down by two pixels
354 pshufw mm0, mm2, 9 ; byte CDEF.. | |
355 punpcklbw mm0, mm6 ; byte->word CDEF | |
; imm 0x94 selects source words 0,1,1,2, pairing neighbours for pmaddwd
356 pshufw mm3, mm1, 0x94 ; word ABBC | |
357 pshufw mm1, mm0, 0x94 ; word CDDE | |
358 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
359 movq mm0, mm1 ; backup for second set of pixels | |
360 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
361 paddd mm3, mm1 ; finish 1st 2px | |
362 | |
363 ; second set of 2 pixels, use backup of above | |
364 punpckhbw mm2, mm6 ; byte->word EFGH | |
365 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
366 pshufw mm1, mm2, 0x94 ; word EFFG | |
367 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
368 paddd mm0, mm1 ; finish 2nd 2px | |
369 | |
370 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
371 packssdw mm3, mm0 ; merge dword->word (4px) | |
372 paddsw mm3, mm7 ; rounding | |
; (sum + 64) >> 7
373 psraw mm3, 7 | |
374 packuswb mm3, mm6 ; clip and word->bytes | |
375 movd [r0], mm3 ; store | |
376 | |
377 ; go to next line | |
378 add r0, r1 | |
379 add r2, r3 | |
380 dec r4 ; next row | |
381 jg .nextrow | |
382 REP_RET | |
383 | |
384 ; 4x4 block, H-only 6-tap filter | |
; put_vp8_epel4_h6_mmxext: horizontal-only 6-tap subpel filter that writes
; 4 output pixels per row (one movd store per loop iteration).
; Register roles, following the put_vp8_epel prototype given at the top of the
; .text section (x86inc numbers the arguments r0, r1, ...):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = height (row counter), r5 = mx (filter selector)
; Only 6 arguments are requested from cglobal, so `my` is not loaded here.
; MMX register roles inside the loop:
;   mm4/mm5/mm6 = tap pairs F0/F1, F2/F3, F4/F5 (as words),
;   mm7 = pw_64 (rounding term),
;   mm3 = zero at loop entry; it is reused as scratch and re-zeroed by the
;         pxor in the second half before the final packuswb and the next row.
385 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
; r5 = mx*3, scaled by 8 in the loads below => byte offset mx*24; each table
; row ("times 4 dw a, b") is 16 bytes and three consecutive rows are read.
; NOTE(review): this indexing lands on a matching three-row group only for
; even mx -- presumably the caller guarantees that; confirm.
386 lea r5d, [r5*3] | |
387 %ifdef PIC | |
; under PIC, sixtap_filter_hw is %defined to r11, so load the table base there
388 lea r11, [sixtap_filter_hw_m] | |
389 %endif | |
390 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
391 movq mm5, [sixtap_filter_hw+r5*8-32] | |
392 movq mm6, [sixtap_filter_hw+r5*8-16] | |
393 movq mm7, [pw_64] | |
394 pxor mm3, mm3 | |
395 | |
396 .nextrow | |
397 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
398 | |
399 ; first set of 2 pixels | |
400 movq mm2, mm1 ; byte ABCD.. | |
401 punpcklbw mm1, mm3 ; byte->word ABCD | |
; imm 0x9 selects source words 1,2,0,0, i.e. shifts the bytes down by two pixels
402 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
403 punpckhbw mm2, mm3 ; byte->word EFGH | |
404 punpcklbw mm0, mm3 ; byte->word CDEF | |
; imm 0x94 selects source words 0,1,1,2, pairing neighbours for pmaddwd
405 pshufw mm1, mm1, 0x94 ; word ABBC | |
406 pshufw mm2, mm2, 0x94 ; word EFFG | |
407 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
; from here mm3 no longer holds zero until the pxor in the second half
408 pshufw mm3, mm0, 0x94 ; word CDDE | |
409 movq mm0, mm3 ; backup for second set of pixels | |
410 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
411 paddd mm1, mm3 ; add to 1st 2px cache | |
412 movq mm3, mm2 ; backup for second set of pixels | |
413 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
414 paddd mm1, mm2 ; finish 1st 2px | |
415 | |
416 ; second set of 2 pixels, use backup of above | |
; 4-byte load starting at src+3 rather than a second 8-byte load
417 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
418 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
419 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
420 paddd mm0, mm3 ; add to 2nd 2px cache | |
; restore mm3 = 0 for the unpack below, the final pack, and the next iteration
421 pxor mm3, mm3 | |
422 punpcklbw mm2, mm3 ; byte->word FGHI | |
; imm 0xE9 selects source words 1,2,2,3
423 pshufw mm2, mm2, 0xE9 ; word GHHI | |
424 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
425 paddd mm0, mm2 ; finish 2nd 2px | |
426 | |
427 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
428 packssdw mm1, mm0 ; merge dword->word (4px) | |
429 paddsw mm1, mm7 ; rounding | |
; (sum + 64) >> 7
430 psraw mm1, 7 | |
431 packuswb mm1, mm3 ; clip and word->bytes | |
432 movd [r0], mm1 ; store | |
433 | |
434 ; go to next line | |
435 add r0, r1 | |
436 add r2, r3 | |
437 dec r4 ; next row | |
438 jg .nextrow | |
439 REP_RET | |
440 | |
441 ; 8-pixel-wide block, H-only 4-tap filter | |
; Register roles as used below: r0 = destination, r1 = destination stride,
; r2 = source, r3 = source stride, r4 = remaining rows, r5 = filter index
; used to address fourtap_filter_hw. Each row is computed as two groups of
; 4 pixels that are merged before the 8-byte store.
442 INIT_XMM | |
443 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
444 shl r5d, 4 | |
445 %ifdef PIC | |
446 lea r11, [fourtap_filter_hw_m] | |
447 %endif | |
448 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
; m6 = second pair of taps, m7 = zero for unpacking
449 mova m6, [fourtap_filter_hw+r5] | |
450 pxor m7, m7 | |
451 | |
452 .nextrow | |
; first group of 4 pixels: load 8 bytes starting one pixel left of the output
453 movh m0, [r2-1] | |
454 punpcklbw m0, m7 ; ABCDEFGH | |
455 mova m1, m0 | |
456 mova m2, m0 | |
457 mova m3, m0 | |
458 psrldq m1, 2 ; BCDEFGH | |
459 psrldq m2, 4 ; CDEFGH | |
460 psrldq m3, 6 ; DEFGH | |
461 punpcklwd m0, m1 ; ABBCCDDE | |
462 punpcklwd m2, m3 ; CDDEEFFG | |
463 pmaddwd m0, m5 | |
464 pmaddwd m2, m6 | |
465 paddd m0, m2 | |
466 | |
; second group of 4 pixels: same steps on the 8 bytes starting 4 pixels later
; (letters continue the naming of the first load)
467 movh m1, [r2+3] | |
468 punpcklbw m1, m7 ; EFGHIJKL | |
469 mova m2, m1 | |
470 mova m3, m1 | |
471 mova m4, m1 | |
472 psrldq m2, 2 ; FGHIJKL | |
473 psrldq m3, 4 ; GHIJKL | |
474 psrldq m4, 6 ; HIJKL | |
475 punpcklwd m1, m2 ; EFFGGHHI | |
476 punpcklwd m3, m4 ; GHHIIJJK | |
477 pmaddwd m1, m5 | |
478 pmaddwd m3, m6 | |
479 paddd m1, m3 | |
480 | |
; merge both groups to 8 words, add the rounding constant, shift, clip to bytes
481 packssdw m0, m1 | |
482 paddsw m0, [pw_64] | |
483 psraw m0, 7 | |
484 packuswb m0, m7 | |
485 movh [r0], m0 ; store | |
486 | |
487 ; go to next line | |
488 add r0, r1 | |
489 add r2, r3 | |
490 dec r4 ; next row | |
491 jg .nextrow | |
492 REP_RET | |
493 | |
; 8-pixel-wide block, H-only 6-tap filter.
; Register roles as used below: r0 = destination, r1 = destination stride,
; r2 = source, r3 = source stride, r4 = remaining rows. r5 arrives as the
; filter index and is turned into a pointer into sixtap_filter_hw; the three
; coefficient rows are then read at r5-48, r5-32 and r5-16.
494 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | |
495 lea r5d, [r5*3] | |
496 %ifdef PIC | |
497 lea r11, [sixtap_filter_hw_m] | |
498 %endif | |
499 lea r5, [sixtap_filter_hw+r5*8] | |
500 pxor m7, m7 | |
501 | |
502 .nextrow | |
; one unaligned 16-byte load per row, starting two pixels left of the output;
; m6 and m4 keep byte copies for the later shifted unpacks
503 movu m0, [r2-2] | |
504 mova m6, m0 | |
505 mova m4, m0 | |
506 punpcklbw m0, m7 ; ABCDEFGH | |
507 mova m1, m0 | |
508 mova m2, m0 | |
509 mova m3, m0 | |
510 psrldq m1, 2 ; BCDEFGH | |
511 psrldq m2, 4 ; CDEFGH | |
512 psrldq m3, 6 ; DEFGH | |
513 psrldq m4, 4 | |
514 punpcklbw m4, m7 ; EFGHIJKL | |
515 mova m5, m4 | |
516 psrldq m5, 2 ; FGHIJKL | |
517 punpcklwd m0, m1 ; ABBCCDDE | |
518 punpcklwd m2, m3 ; CDDEEFFG | |
519 punpcklwd m4, m5 ; EFFGGHHI | |
520 pmaddwd m0, [r5-48] | |
521 pmaddwd m2, [r5-32] | |
522 pmaddwd m4, [r5-16] | |
523 paddd m0, m2 | |
524 paddd m0, m4 | |
525 | |
; second group of 4 pixels: repeat the steps on the byte copy shifted by
; 4 pixels (letters continue the naming of the first group)
526 psrldq m6, 4 | |
527 mova m4, m6 | |
528 punpcklbw m6, m7 ; EFGHIJKL | |
529 mova m1, m6 | |
530 mova m2, m6 | |
531 mova m3, m6 | |
532 psrldq m1, 2 ; FGHIJKL | |
533 psrldq m2, 4 ; GHIJKL | |
534 psrldq m3, 6 ; HIJKL | |
535 psrldq m4, 4 | |
536 punpcklbw m4, m7 ; IJKLMNOP | |
537 mova m5, m4 | |
538 psrldq m5, 2 ; JKLMNOP | |
539 punpcklwd m6, m1 ; EFFGGHHI | |
540 punpcklwd m2, m3 ; GHHIIJJK | |
541 punpcklwd m4, m5 ; IJJKKLLM | |
542 pmaddwd m6, [r5-48] | |
543 pmaddwd m2, [r5-32] | |
544 pmaddwd m4, [r5-16] | |
545 paddd m6, m2 | |
546 paddd m6, m4 | |
547 | |
; merge both groups to 8 words, add the rounding constant, shift, clip to bytes
548 packssdw m0, m6 | |
549 paddsw m0, [pw_64] | |
550 psraw m0, 7 | |
551 packuswb m0, m7 | |
552 movh [r0], m0 ; store | |
553 | |
554 ; go to next line | |
555 add r0, r1 | |
556 add r2, r3 | |
557 dec r4 ; next row | |
558 jg .nextrow | |
559 REP_RET | |
560 | |
; FILTER_V emits the vertical-only 4-tap and 6-tap subpel filter functions.
; Arguments: 1 = instruction-set suffix used in the function names,
; 2 = block width used in the function names, 3 = value passed through as the
; last cglobal argument.
; In both functions: r0 = destination, r1 = destination stride, r2 = source,
; r3 = source stride, r4 = remaining rows, r6 = filter index (turned into a
; pointer to the selected coefficient rows).
561 %macro FILTER_V 3 | |
562 ; V-only 4-tap filter, block width as given by the macro's second argument | |
563 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
564 shl r6d, 5 | |
565 %ifdef PIC | |
566 lea r11, [fourtap_filter_v_m] | |
567 %endif | |
; r6 now points at the selected coefficients, read at r6+0/+16/+32/+48
568 lea r6, [fourtap_filter_v+r6-32] | |
; m6 = rounding constant, m7 = zero for unpacking, m5 = coefficient row 3
569 mova m6, [pw_64] | |
570 pxor m7, m7 | |
571 mova m5, [r6+48] | |
572 | |
573 ; read 3 lines | |
; (the row above the current one, the current row and the row below; r2 is
; restored to the current row afterwards)
574 sub r2, r3 | |
575 movh m0, [r2] | |
576 movh m1, [r2+ r3] | |
577 movh m2, [r2+2*r3] | |
578 add r2, r3 | |
579 punpcklbw m0, m7 | |
580 punpcklbw m1, m7 | |
581 punpcklbw m2, m7 | |
582 | |
583 .nextrow | |
584 ; first calculate negative taps (to prevent losing positive overflows) | |
585 movh m4, [r2+2*r3] ; read new row | |
586 punpcklbw m4, m7 | |
; m3 keeps an unscaled copy of the new row for the rotation below
587 mova m3, m4 | |
588 pmullw m0, [r6+0] | |
589 pmullw m4, m5 | |
590 paddsw m4, m0 | |
591 | |
592 ; then calculate positive taps | |
; each row register is copied down one slot (m1->m0, m2->m1, m3->m2) before
; being scaled, so m0..m2 hold the three rows needed by the next iteration
593 mova m0, m1 | |
594 pmullw m1, [r6+16] | |
595 paddsw m4, m1 | |
596 mova m1, m2 | |
597 pmullw m2, [r6+32] | |
598 paddsw m4, m2 | |
599 mova m2, m3 | |
600 | |
601 ; round/clip/store | |
602 paddsw m4, m6 | |
603 psraw m4, 7 | |
604 packuswb m4, m7 | |
605 movh [r0], m4 | |
606 | |
607 ; go to next line | |
608 add r0, r1 | |
609 add r2, r3 | |
610 dec r4 ; next row | |
611 jg .nextrow | |
612 REP_RET | |
613 | |
614 | |
615 ; V-only 6-tap filter, block width as given by the macro's second argument | |
616 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
; r6 = filter index * 48
617 shl r6d, 4 | |
618 lea r6, [r6*3] | |
619 %ifdef PIC | |
620 lea r11, [sixtap_filter_v_m] | |
621 %endif | |
; r6 now points at the selected coefficients, read at r6+0 .. r6+80 in
; 16-byte steps
622 lea r6, [sixtap_filter_v+r6-96] | |
623 pxor m7, m7 | |
624 | |
625 ; read 5 lines | |
; (two rows above the current one through two rows below; r2 ends up one row
; below its entry value, so [r2+2*r3] in the loop is the next unread row)
626 sub r2, r3 | |
627 sub r2, r3 | |
628 movh m0, [r2] | |
629 movh m1, [r2+r3] | |
630 movh m2, [r2+r3*2] | |
631 lea r2, [r2+r3*2] | |
632 add r2, r3 | |
633 movh m3, [r2] | |
634 movh m4, [r2+r3] | |
635 punpcklbw m0, m7 | |
636 punpcklbw m1, m7 | |
637 punpcklbw m2, m7 | |
638 punpcklbw m3, m7 | |
639 punpcklbw m4, m7 | |
640 | |
641 .nextrow | |
642 ; first calculate negative taps (to prevent losing positive overflows) | |
643 mova m5, m1 | |
644 pmullw m5, [r6+16] | |
645 mova m6, m4 | |
646 pmullw m6, [r6+64] | |
647 paddsw m6, m5 | |
648 | |
649 ; then calculate positive taps | |
650 movh m5, [r2+2*r3] ; read new row | |
651 punpcklbw m5, m7 | |
; the remaining taps are accumulated while the row registers are shifted
; down one slot (m1->m0, m2->m1, m3->m2, m4->m3, new row->m4)
652 pmullw m0, [r6+0] | |
653 paddsw m6, m0 | |
654 mova m0, m1 | |
655 mova m1, m2 | |
656 pmullw m2, [r6+32] | |
657 paddsw m6, m2 | |
658 mova m2, m3 | |
659 pmullw m3, [r6+48] | |
660 paddsw m6, m3 | |
661 mova m3, m4 | |
662 mova m4, m5 | |
663 pmullw m5, [r6+80] | |
664 paddsw m6, m5 | |
665 | |
666 ; round/clip/store | |
667 paddsw m6, [pw_64] | |
668 psraw m6, 7 | |
669 packuswb m6, m7 | |
670 movh [r0], m6 | |
671 | |
672 ; go to next line | |
673 add r0, r1 | |
674 add r2, r3 | |
675 dec r4 ; next row | |
676 jg .nextrow | |
677 REP_RET | |
678 %endmacro | |
679 | |
; instantiate the vertical filters: width 4 under INIT_MMX, width 8 under INIT_XMM
680 INIT_MMX | |
681 FILTER_V mmxext, 4, 0 | |
682 INIT_XMM | |
683 FILTER_V sse2, 8, 8 | |
684 | |
; FILTER_BILINEAR emits the vertical and horizontal bilinear functions.
; Arguments: 1 = instruction-set suffix used in the function names,
; 2 = block width used in the function names, 3 = value passed through as the
; last cglobal argument.
; In both functions: r0 = destination, r1 = destination stride, r2 = source,
; r3 = source stride, r4 = remaining rows; two rows are produced per iteration.
; NOTE(review): the row counter is updated as full-width r4 here, whereas
; put_vp8_pixels* in this file use r4d -- confirm which width is intended.
11991 | 685 %macro FILTER_BILINEAR 3 |
686 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
; r6 = fraction index scaled to a 16-byte table offset, r5 = (8 - index)
; scaled the same way
687 mov r5d, 8*16 | |
688 shl r6d, 4 | |
689 sub r5d, r6d | |
690 %ifdef PIC | |
691 lea r11, [bilinear_filter_vw_m] | |
692 %endif | |
; m6 = zero, m4 = weight applied to the upper row, m5 = weight applied to
; the lower row
693 pxor m6, m6 | |
12000 | 694 mova m4, [bilinear_filter_vw+r5-16] |
695 mova m5, [bilinear_filter_vw+r6-16] | |
11991 | 696 .nextrow |
; load three consecutive rows; the middle row (m1) is used by both outputs
697 movh m0, [r2+r3*0] | |
698 movh m1, [r2+r3*1] | |
699 movh m3, [r2+r3*2] | |
700 punpcklbw m0, m6 | |
701 punpcklbw m1, m6 | |
702 punpcklbw m3, m6 | |
703 mova m2, m1 | |
704 pmullw m0, m4 | |
705 pmullw m1, m5 | |
706 pmullw m2, m4 | |
707 pmullw m3, m5 | |
708 paddsw m0, m1 | |
709 paddsw m2, m3 | |
; shift by 2, then pavgw against zero performs the final rounded halving
710 psraw m0, 2 | |
711 psraw m2, 2 | |
712 pavgw m0, m6 | |
713 pavgw m2, m6 | |
714 %ifidn %1, mmxext | |
; pack and store each output row separately
715 packuswb m0, m0 | |
716 packuswb m2, m2 | |
717 movh [r0+r1*0], m0 | |
718 movh [r0+r1*1], m2 | |
719 %else | |
; pack both output rows into one register: low half = first row,
; high half = second row
720 packuswb m0, m2 | |
721 movh [r0+r1*0], m0 | |
722 movhps [r0+r1*1], m0 | |
723 %endif | |
724 | |
725 lea r0, [r0+r1*2] | |
726 lea r2, [r2+r3*2] | |
727 sub r4, 2 | |
728 jg .nextrow | |
729 REP_RET | |
730 | |
731 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
; same weight setup as the vertical function, but the fraction index is
; taken from r5 and r6 receives (8 - index)
732 mov r6d, 8*16 | |
733 shl r5d, 4 | |
734 sub r6d, r5d | |
735 %ifdef PIC | |
736 lea r11, [bilinear_filter_vw_m] | |
737 %endif | |
738 pxor m6, m6 | |
12000 | 739 mova m4, [bilinear_filter_vw+r6-16] |
740 mova m5, [bilinear_filter_vw+r5-16] | |
11991 | 741 .nextrow |
; for each of two rows load the pixels and the pixels one byte to the right
742 movh m0, [r2+r3*0+0] | |
743 movh m1, [r2+r3*0+1] | |
744 movh m2, [r2+r3*1+0] | |
745 movh m3, [r2+r3*1+1] | |
746 punpcklbw m0, m6 | |
747 punpcklbw m1, m6 | |
748 punpcklbw m2, m6 | |
749 punpcklbw m3, m6 | |
750 pmullw m0, m4 | |
751 pmullw m1, m5 | |
752 pmullw m2, m4 | |
753 pmullw m3, m5 | |
754 paddsw m0, m1 | |
755 paddsw m2, m3 | |
; shift by 2, then pavgw against zero performs the final rounded halving
756 psraw m0, 2 | |
757 psraw m2, 2 | |
758 pavgw m0, m6 | |
759 pavgw m2, m6 | |
760 %ifidn %1, mmxext | |
761 packuswb m0, m0 | |
762 packuswb m2, m2 | |
763 movh [r0+r1*0], m0 | |
764 movh [r0+r1*1], m2 | |
765 %else | |
766 packuswb m0, m2 | |
767 movh [r0+r1*0], m0 | |
768 movhps [r0+r1*1], m0 | |
769 %endif | |
770 | |
771 lea r0, [r0+r1*2] | |
772 lea r2, [r2+r3*2] | |
773 sub r4, 2 | |
774 jg .nextrow | |
775 REP_RET | |
776 %endmacro | |
777 | |
; instantiate the bilinear filters: width 4 under INIT_MMX, width 8 under INIT_XMM
778 INIT_MMX | |
779 FILTER_BILINEAR mmxext, 4, 0 | |
780 INIT_XMM | |
781 FILTER_BILINEAR sse2, 8, 7 | |
782 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
; FILTER_BILINEAR_SSSE3 emits the pmaddubsw-based bilinear functions.
; Argument 1 = block width used in the function names.
; In both functions: r0 = destination, r1 = destination stride, r2 = source,
; r3 = source stride, r4 = remaining rows; two rows are produced per iteration.
783 %macro FILTER_BILINEAR_SSSE3 1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
784 cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |
11991 | 785 shl r6d, 4 |
786 %ifdef PIC | |
787 lea r11, [bilinear_filter_vb_m] | |
788 %endif | |
; m4 = zero (for pavgw), m3 = byte weights selected by r6
789 pxor m4, m4 | |
12000 | 790 mova m3, [bilinear_filter_vb+r6-16] |
11991 | 791 .nextrow |
792 movh m0, [r2+r3*0] | |
793 movh m1, [r2+r3*1] | |
794 movh m2, [r2+r3*2] | |
; interleave each row with the row below it so that pmaddubsw forms the
; weighted sum of every vertical pixel pair
795 punpcklbw m0, m1 | |
796 punpcklbw m1, m2 | |
797 pmaddubsw m0, m3 | |
798 pmaddubsw m1, m3 | |
; shift by 2, then pavgw against zero performs the final rounded halving
799 psraw m0, 2 | |
800 psraw m1, 2 | |
801 pavgw m0, m4 | |
802 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
803 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
804 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
805 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
806 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
807 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
808 %else |
11991 | 809 packuswb m0, m1 |
810 movh [r0+r1*0], m0 | |
811 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
812 %endif |
11991 | 813 |
814 lea r0, [r0+r1*2] | |
815 lea r2, [r2+r3*2] | |
816 sub r4, 2 | |
817 jg .nextrow | |
818 REP_RET | |
819 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
820 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |
11991 | 821 shl r5d, 4 |
822 %ifdef PIC | |
823 lea r11, [bilinear_filter_vb_m] | |
824 %endif | |
; m4 = zero (for pavgw), m2 = byte shuffle mask, m3 = byte weights selected by r5
; NOTE(review): filter_h2_shuf presumably pairs each byte with its right
; neighbour; the table is defined outside this section -- confirm.
825 pxor m4, m4 | |
826 mova m2, [filter_h2_shuf] | |
12000 | 827 mova m3, [bilinear_filter_vb+r5-16] |
11991 | 828 .nextrow |
829 movu m0, [r2+r3*0] | |
830 movu m1, [r2+r3*1] | |
831 pshufb m0, m2 | |
832 pshufb m1, m2 | |
833 pmaddubsw m0, m3 | |
834 pmaddubsw m1, m3 | |
835 psraw m0, 2 | |
836 psraw m1, 2 | |
837 pavgw m0, m4 | |
838 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
839 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
840 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
841 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
842 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
843 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
844 %else |
11991 | 845 packuswb m0, m1 |
846 movh [r0+r1*0], m0 | |
847 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
848 %endif |
11991 | 849 |
850 lea r0, [r0+r1*2] | |
851 lea r2, [r2+r3*2] | |
852 sub r4, 2 | |
853 jg .nextrow | |
854 REP_RET | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
855 %endmacro |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
856 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
; instantiate the SSSE3 bilinear filters: width 4 under INIT_MMX, width 8 under INIT_XMM
857 INIT_MMX |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
858 FILTER_BILINEAR_SSSE3 4 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
859 INIT_XMM |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
860 FILTER_BILINEAR_SSSE3 8 |
11991 | 861 |
; Straight copy of an 8-byte-wide block, two rows per iteration.
; r0 = destination, r1 = destination stride, r2 = source, r3 = source stride,
; r4d = remaining rows (decremented by 2 each pass).
11992 | 862 cglobal put_vp8_pixels8_mmx, 5,5 |
863 .nextrow: | |
864 movq mm0, [r2+r3*0] | |
865 movq mm1, [r2+r3*1] | |
866 lea r2, [r2+r3*2] | |
867 movq [r0+r1*0], mm0 | |
868 movq [r0+r1*1], mm1 | |
869 lea r0, [r0+r1*2] | |
870 sub r4d, 2 | |
871 jg .nextrow | |
872 REP_RET | |
873 | |
; Straight copy of a 16-byte-wide block using two 8-byte MMX moves per row,
; two rows per iteration.
; r0 = destination, r1 = destination stride, r2 = source, r3 = source stride,
; r4d = remaining rows (decremented by 2 each pass).
874 cglobal put_vp8_pixels16_mmx, 5,5 | |
875 .nextrow: | |
876 movq mm0, [r2+r3*0+0] | |
877 movq mm1, [r2+r3*0+8] | |
878 movq mm2, [r2+r3*1+0] | |
879 movq mm3, [r2+r3*1+8] | |
880 lea r2, [r2+r3*2] | |
881 movq [r0+r1*0+0], mm0 | |
882 movq [r0+r1*0+8], mm1 | |
883 movq [r0+r1*1+0], mm2 | |
884 movq [r0+r1*1+8], mm3 | |
885 lea r0, [r0+r1*2] | |
886 sub r4d, 2 | |
887 jg .nextrow | |
888 REP_RET | |
889 | |
; Straight copy of a 16-byte-wide block, two rows per iteration.
; r0 = destination, r1 = destination stride, r2 = source, r3 = source stride,
; r4d = remaining rows (decremented by 2 each pass).
; Loads are unaligned (movups) but stores use movaps, so every destination
; row address must be 16-byte aligned.
890 cglobal put_vp8_pixels16_sse, 5,5,2 | |
891 .nextrow: | |
892 movups xmm0, [r2+r3*0] | |
893 movups xmm1, [r2+r3*1] | |
894 lea r2, [r2+r3*2] | |
895 movaps [r0+r1*0], xmm0 | |
896 movaps [r0+r1*1], xmm1 | |
897 lea r0, [r0+r1*2] | |
898 sub r4d, 2 | |
899 jg .nextrow | |
900 REP_RET | |
901 | |
11975 | 902 ;----------------------------------------------------------------------------- |
903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
904 ;----------------------------------------------------------------------------- | |
905 | |
; ADD_DC: apply a signed per-pixel offset to four rows of bytes using two
; unsigned-saturating steps.
; Arguments: 1 = register added with unsigned saturation, 2 = register
; subtracted with unsigned saturation, 3 = byte offset within each row,
; 4 = move instruction used for both the loads and the stores.
; The four rows are addressed as r0, r0+r2, r1 and r1+r2.
; Uses m2-m5 as temporaries.
12238 | 906 %macro ADD_DC 4 |
907 %4 m2, [r0+%3] | |
908 %4 m3, [r0+r2+%3] | |
909 %4 m4, [r1+%3] | |
910 %4 m5, [r1+r2+%3] | |
911 paddusb m2, %1 | |
912 paddusb m3, %1 | |
913 paddusb m4, %1 | |
914 paddusb m5, %1 | |
915 psubusb m2, %2 | |
916 psubusb m3, %2 | |
917 psubusb m4, %2 | |
918 psubusb m5, %2 | |
919 %4 [r0+%3], m2 | |
920 %4 [r0+r2+%3], m3 | |
921 %4 [r1+%3], m4 | |
922 %4 [r1+r2+%3], m5 | |
923 %endmacro | |
924 | |
; DC-only inverse transform for one block: r0 = dst, r1 = coefficient block,
; r2 = stride (per the prototype comment above). r1 is reused as a pointer to
; the third destination row once the coefficient has been read and cleared.
925 INIT_MMX | |
11975 | 926 cglobal vp8_idct_dc_add_mmx, 3, 3 |
927 ; load data | |
12238 | 928 movd m0, [r1] |
11975 | 929 |
930 ; calculate DC | |
; m0 = (coefficient + 4) >> 3, m1 = its negation; packuswb clamps negative
; words to 0, so m0 ends up holding the positive part and m1 the magnitude
; of the negative part
12238 | 931 paddw m0, [pw_4] |
932 pxor m1, m1 |
933 psraw m0, 3 |
; m1 is still zero here: this store clears the coefficient just read
934 movd [r1], m1 |
935 psubw m1, m0 |
936 packuswb m0, m0 |
937 packuswb m1, m1 |
; replicate the low byte so it fills the 4 bytes used by the movh accesses
938 punpcklbw m0, m0 |
939 punpcklbw m1, m1 |
940 punpcklwd m0, m0 |
941 punpcklwd m1, m1 |
11975 | 942 |
943 ; add DC | |
12238 | 944 lea r1, [r0+r2*2] |
945 ADD_DC m0, m1, 0, movh | |
11975 | 946 RET |
947 | |
; DC-only inverse transform for one block, word-arithmetic variant:
; r0 = dst, r1 = coefficient block, r2 = stride (per the prototype comment
; above). The four 4-byte rows are gathered into registers, widened to words,
; offset by the DC value, clamped by packuswb and written back.
12238 | 948 INIT_XMM |
11975 | 949 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
950 ; load data | |
12238 | 951 movd m0, [r1] |
952 pxor m1, m1 | |
953 | |
954 ; calculate DC | |
955 paddw m0, [pw_4] | |
; m1 is zero: clear the coefficient just read, then reuse r1 as a pointer to
; the third destination row
956 movd [r1], m1 | |
957 lea r1, [r0+r2*2] | |
958 movd m2, [r0] | |
959 movd m3, [r0+r2] | |
960 movd m4, [r1] | |
961 movd m5, [r1+r2] | |
962 psraw m0, 3 | |
; broadcast the DC word to all 8 word lanes
963 pshuflw m0, m0, 0 | |
964 punpcklqdq m0, m0 | |
; m2 = rows 0 and 1, m4 = rows 2 and 3, each widened from bytes to words
965 punpckldq m2, m3 | |
966 punpckldq m4, m5 | |
967 punpcklbw m2, m1 | |
968 punpcklbw m4, m1 | |
969 paddw m2, m0 | |
970 paddw m4, m0 | |
; clamp to bytes; dword lanes 0..3 of m2 are rows 0..3
971 packuswb m2, m4 | |
972 movd [r0], m2 | |
973 pextrd [r0+r2], m2, 1 | |
974 pextrd [r1], m2, 2 | |
975 pextrd [r1+r2], m2, 3 | |
976 RET | |
977 | |
978 ;----------------------------------------------------------------------------- | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
12238 | 980 ;----------------------------------------------------------------------------- |
981 | |
982 INIT_MMX | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; DC-only inverse transform for four blocks (A..D) whose first coefficients
; sit 32 bytes apart at r1; r0 = dst, r2 = stride (per the prototype comment
; above). Blocks A/B cover bytes 0-7 and C/D bytes 8-15 of the same four rows.
983 cglobal vp8_idct_dc_add4y_mmx, 3, 3 |
12238 | 984 ; load data |
985 movd m0, [r1+32*0] ; A | |
986 movd m1, [r1+32*2] ; C | |
987 punpcklwd m0, [r1+32*1] ; A B | |
988 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 989 punpckldq m0, m1 ; A B C D |
12238 | 990 pxor m6, m6 |
11975 | 991 |
992 ; calculate DC | |
12238 | 993 paddw m0, [pw_4] |
; m6 is still zero: clear the four coefficients that were just read
994 movd [r1+32*0], m6 | |
995 movd [r1+32*1], m6 | |
996 movd [r1+32*2], m6 | |
997 movd [r1+32*3], m6 | |
998 psraw m0, 3 | |
; m6 = negated DCs; packuswb clamps negative words to 0, leaving the positive
; parts in m0 and the magnitudes of the negative parts in m6
999 psubw m6, m0 | |
1000 packuswb m0, m0 | |
1001 packuswb m6, m6 | |
1002 punpcklbw m0, m0 ; AABBCCDD | |
1003 punpcklbw m6, m6 ; AABBCCDD (negated DCs) | |
1004 movq m1, m0 | |
1005 movq m7, m6 | |
1006 punpcklbw m0, m0 ; AAAABBBB | |
1007 punpckhbw m1, m1 ; CCCCDDDD | |
1008 punpcklbw m6, m6 ; AAAABBBB (negated DCs) | |
1009 punpckhbw m7, m7 ; CCCCDDDD (negated DCs) | |
1010 | |
1011 ; add DC | |
1012 lea r1, [r0+r2*2] | |
1013 ADD_DC m0, m6, 0, mova | |
1014 ADD_DC m1, m7, 8, mova | |
1015 RET | |
1016 | |
1017 INIT_XMM | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; DC-only inverse transform for four blocks (A..D) whose first coefficients
; sit 32 bytes apart at r1; r0 = dst, r2 = stride (per the prototype comment
; above). One ADD_DC pass with mova covers the four rows, so each destination
; row is accessed with an aligned full-register move.
1018 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |
12238 | 1019 ; load data |
1020 movd m0, [r1+32*0] ; A | |
1021 movd m1, [r1+32*2] ; C | |
1022 punpcklwd m0, [r1+32*1] ; A B | |
1023 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 1024 punpckldq m0, m1 ; A B C D |
12238 | 1025 pxor m1, m1 |
1026 | |
1027 ; calculate DC | |
1028 paddw m0, [pw_4] | |
; m1 is still zero: clear the four coefficients that were just read
1029 movd [r1+32*0], m1 | |
1030 movd [r1+32*1], m1 | |
1031 movd [r1+32*2], m1 | |
1032 movd [r1+32*3], m1 | |
1033 psraw m0, 3 | |
; m1 = negated DCs; packuswb clamps negative words to 0, leaving the positive
; parts in m0 and the magnitudes of the negative parts in m1
1034 psubw m1, m0 | |
1035 packuswb m0, m0 | |
1036 packuswb m1, m1 | |
; two byte-duplication steps expand ABCD to AAAABBBBCCCCDDDD
1037 punpcklbw m0, m0 | |
1038 punpcklbw m1, m1 | |
1039 punpcklbw m0, m0 | |
1040 punpcklbw m1, m1 | |
1041 | |
1042 ; add DC | |
1043 lea r1, [r0+r2*2] | |
1044 ADD_DC m0, m1, 0, mova | |
11975 | 1045 RET |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1046 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1047 ;----------------------------------------------------------------------------- |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1048 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1049 ;----------------------------------------------------------------------------- |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1050 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1051 INIT_MMX |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; DC-only inverse transform for four blocks (A..D) whose first coefficients
; sit 32 bytes apart at r1; r0 = dst, r2 = stride (per the prototype comment
; above). Blocks A/B are applied to the first four rows, then r0/r1 advance
; by four rows and blocks C/D are applied to the next four.
1052 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1053 ; load data |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1054 movd m0, [r1+32*0] ; A |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1055 movd m1, [r1+32*2] ; C |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1056 punpcklwd m0, [r1+32*1] ; A B |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1057 punpcklwd m1, [r1+32*3] ; C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1058 punpckldq m0, m1 ; A B C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1059 pxor m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1060 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1061 ; calculate DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1062 paddw m0, [pw_4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; m6 is still zero: the next four stores clear the coefficients just read
1063 movd [r1+32*0], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1064 movd [r1+32*1], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1065 movd [r1+32*2], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1066 movd [r1+32*3], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1067 psraw m0, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; m6 = negated DCs; packuswb clamps negative words to 0
1068 psubw m6, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1069 packuswb m0, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1070 packuswb m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1071 punpcklbw m0, m0 ; AABBCCDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1072 punpcklbw m6, m6 ; AABBCCDD (negated DCs) |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1073 movq m1, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1074 movq m7, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1075 punpcklbw m0, m0 ; AAAABBBB |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1076 punpckhbw m1, m1 ; CCCCDDDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1077 punpcklbw m6, m6 ; AAAABBBB (negated DCs) |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1078 punpckhbw m7, m7 ; CCCCDDDD (negated DCs) |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1079 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1080 ; add DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1081 lea r1, [r0+r2*2] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1082 ADD_DC m0, m6, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
; step both row pointers down four rows for blocks C and D
1083 lea r0, [r0+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1084 lea r1, [r1+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1085 ADD_DC m1, m7, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1086 RET |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1087 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1088 ;----------------------------------------------------------------------------- |
12013 | 1089 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1090 ;----------------------------------------------------------------------------- | |
1091 | |
1092 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
1093 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
; Arguments 3 and 4 are scratch registers and are overwritten.
; mul_20091(x) is formed as x + pmulhw(x, m6); mul_35468(x) is formed as
; pmulhw(2*x, m7), i.e. the doubled input times 17734.
1094 %macro VP8_MULTIPLY_SUMSUB 4 | |
1095 mova %3, %1 | |
1096 mova %4, %2 | |
1097 pmulhw %3, m6 ;20091(1) | |
1098 pmulhw %4, m6 ;20091(2) | |
1099 paddw %3, %1 | |
1100 paddw %4, %2 | |
12018 | 1101 paddw %1, %1 |
1102 paddw %2, %2 | |
12013 | 1103 pmulhw %1, m7 ;35468(1) |
1104 pmulhw %2, m7 ;35468(2) | |
1105 psubw %1, %4 | |
1106 paddw %2, %3 | |
1107 %endmacro | |
1108 | |
1109 ; calculate x0=%1+%3; x1=%1-%3 | |
1110 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
1111 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
1112 ; %5/%6 are temporary registers | |
1113 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
; All six arguments are register numbers (they are used as m<n> below).
; SUMSUB_BA and SWAP are defined outside this section; the two SWAPs at the
; end rename registers so the results land in the order listed above.
1114 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
1115 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
1116 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
1117 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
1118 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
1119 SWAP %4, %1 | |
1120 SWAP %4, %3 | |
1121 %endmacro | |
1122 | |
1123 INIT_MMX | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
; VP8_IDCT_ADD emits vp8_idct_add_<suffix>: full 4x4 inverse transform of the
; coefficient block at r1, added to the pixels at r0 with stride r2 (per the
; prototype comment above). The coefficient block is zeroed after loading.
; Argument 1 = function-name suffix; "sse" selects the 16-byte clearing path.
1124 %macro VP8_IDCT_ADD 1 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1125 cglobal vp8_idct_add_%1, 3, 3 |
12013 | 1126 ; load block data |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1127 movq m0, [r1+ 0] |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1128 movq m1, [r1+ 8] |
12013 | 1129 movq m2, [r1+16] |
1130 movq m3, [r1+24] | |
1131 movq m6, [pw_20091] | |
1132 movq m7, [pw_17734] | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
; clear the 32-byte coefficient block: two aligned 16-byte stores in the sse
; variant, four 8-byte stores otherwise
; NOTE(review): xmm0 is written explicitly here while the block data lives in
; m0-m3; presumably m0 maps to the MMX register under INIT_MMX so the two do
; not alias -- confirm.
1133 %ifidn %1, sse |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1134 xorps xmm0, xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1135 movaps [r1+ 0], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1136 movaps [r1+16], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1137 %else |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1138 pxor m4, m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1139 movq [r1+ 0], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1140 movq [r1+ 8], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1141 movq [r1+16], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1142 movq [r1+24], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1143 %endif |
12013 | 1144 |
1145 ; actual IDCT | |
; two 1-D passes with a transpose after each; the rounding constant is added
; to m0 between the passes
1146 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1147 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1148 paddw m0, [pw_4] | |
1149 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1150 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1151 | |
1152 ; store | |
; r1 is reused as a pointer to the third destination row
1153 pxor m4, m4 | |
1154 lea r1, [r0+2*r2] | |
1155 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
1156 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
1157 | |
1158 RET | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1159 %endmacro |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1160 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1161 VP8_IDCT_ADD mmx |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1162 VP8_IDCT_ADD sse |
12013 | 1163 |
1164 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1165 ; void vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1166 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1167 |
; SCATTER_WHT: write the four word lanes of two registers to 16-bit slots
; spaced 32 bytes (2*16) apart, starting at r0.
; Arguments: 1 and 2 = register numbers (used as m<n>), 3 = slot offset.
; Lanes 0..3 of the first register go to slots 0, 4, 8, 12 (+offset); lanes
; 0..3 of the second register go to slots 1, 5, 9, 13 (+offset).
; Clobbers r1 and r2 and shifts both source registers right by 32 bits.
12209 | 1168 %macro SCATTER_WHT 3 |
1169 movd r1d, m%1 | |
1170 movd r2d, m%2 | |
1171 mov [r0+2*16*(0+%3)], r1w | |
1172 mov [r0+2*16*(1+%3)], r2w | |
1173 shr r1d, 16 | |
1174 shr r2d, 16 | |
1175 psrlq m%1, 32 | |
1176 psrlq m%2, 32 | |
1177 mov [r0+2*16*(4+%3)], r1w | |
1178 mov [r0+2*16*(5+%3)], r2w | |
1179 movd r1d, m%1 | |
1180 movd r2d, m%2 | |
1181 mov [r0+2*16*(8+%3)], r1w | |
1182 mov [r0+2*16*(9+%3)], r2w | |
1183 shr r1d, 16 | |
1184 shr r2d, 16 | |
1185 mov [r0+2*16*(12+%3)], r1w | |
1186 mov [r0+2*16*(13+%3)], r2w | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1187 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1188 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; HADAMARD4_1D: one 4-point pass over four registers, built from two
; SUMSUB_BADC butterfly stages followed by a register rename via SWAP.
; All four arguments are register numbers (used as m<n>).
; NOTE(review): SUMSUB_BADC and SWAP are defined outside this section; the
; exact output ordering depends on them -- confirm against their definitions.
1189 %macro HADAMARD4_1D 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1190 SUMSUB_BADC m%2, m%1, m%4, m%3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1191 SUMSUB_BADC m%4, m%2, m%3, m%1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1192 SWAP %1, %4, %3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1193 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1194 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1195 INIT_MMX |
12209 | 1196 cglobal vp8_luma_dc_wht_mmx, 2,3 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1197 movq m0, [r1] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1198 movq m1, [r1+8] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1199 movq m2, [r1+16] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1200 movq m3, [r1+24] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1201 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1202 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1203 paddw m0, [pw_3] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1204 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1205 psraw m0, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1206 psraw m1, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1207 psraw m2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1208 psraw m3, 3 |
12209 | 1209 SCATTER_WHT 0, 1, 0 |
1210 SCATTER_WHT 2, 3, 2 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1211 RET |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1212 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1213 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1214 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1215 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1216 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1217 ; macro called with 7 mm register indexes as argument, and 4 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1218 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1219 ; first 4 mm registers will carry the transposed pixel data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1220 ; the other three are scratchspace (one would be sufficient, but this allows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1221 ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1222 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1223 ; first two regular registers are buf+4*stride and buf+5*stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1224 ; third is -stride, fourth is +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1225 %macro READ_8x4_INTERLEAVED 11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1226 ; interleave 8 (A-H) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1227 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1228 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1229 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1230 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1231 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1232 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1233 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1234 punpcklbw m%1, m%5 ; A/B interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1235 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1236 punpcklbw m%2, m%6 ; C/D interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1237 punpcklbw m%3, m%7 ; E/F interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1238 punpcklbw m%4, m%5 ; G/H interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1239 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1240 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1241 ; macro called with 7 mm register indexes as argument, and 5 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1242 ; first 11 mean the same as READ_8x4_INTERLEAVED above |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1243 ; fifth regular register is scratchspace to reach the bottom 8 rows, it |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1244 ; will be set to second regular register + 8*stride at the end |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1245 %macro READ_16x4_INTERLEAVED 12 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1246 ; transpose 16 (A-P) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1247 lea %12, [r0+8*r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1248 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1249 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1250 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1251 movd m%3, [%12+%10*4] ; I0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1252 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1253 movd m%4, [%12+%10*2] ; K0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1254 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1255 movd m%5, [%12+%10] ; L0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1256 movd m%7, [%12] ; M0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1257 add %12, %11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1258 punpcklbw m%1, m%3 ; A/I |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1259 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1260 punpcklbw m%2, m%4 ; C/K |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1261 punpcklbw m%6, m%5 ; D/L |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1262 punpcklbw m%3, m%7 ; E/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1263 punpcklbw m%2, m%6 ; C/D/K/L interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1264 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1265 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1266 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1267 movd m%4, [%12+%10*4] ; J0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1268 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1269 movd m%6, [%12] ; N0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1270 punpcklbw m%5, m%4 ; B/J |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1271 punpcklbw m%7, m%6 ; F/N |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1272 punpcklbw m%1, m%5 ; A/B/I/J interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1273 punpcklbw m%3, m%7 ; E/F/M/N interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1274 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1275 movd m%6, [%12+%11] ; O0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1276 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1277 movd m%7, [%12+%11*2] ; P0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1278 punpcklbw m%4, m%6 ; G/O |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1279 punpcklbw m%5, m%7 ; H/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1280 punpcklbw m%4, m%5 ; G/H/O/P interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1281 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1282 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1283 ; write 4 mm registers of 2 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1284 ; first four arguments are mm register indexes containing source data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1285 ; last four are registers containing buf+4*stride, buf+5*stride, |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1286 ; -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1287 %macro WRITE_4x2D 8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1288 ; write out (2 dwords per register) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1289 movd [%5+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1290 movd [%5+%7*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1291 movd [%5], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1292 movd [%6+%8], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1293 punpckhdq m%1, m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1294 punpckhdq m%2, m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1295 punpckhdq m%3, m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1296 punpckhdq m%4, m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1297 movd [%6+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1298 movd [%5+%7], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1299 movd [%6], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1300 movd [%6+%8*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1301 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1302 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1303 ; write 4 xmm registers of 4 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1304 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1305 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1306 ; we add 1*stride to the third regular register in the process |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1307 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1308 ; same memory region), or 8 if they cover two separate buffers (third one points to |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1309 ; a different memory region than the first two), allowing for more optimal code for |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1310 ; the 16-width case |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1311 %macro WRITE_4x4D 10 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1312 ; write out (4 dwords per register), start with dwords zero |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1313 movd [%5+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1314 movd [%5], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1315 movd [%7+%8*4], m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1316 movd [%7], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1317 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1318 ; store dwords 1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1319 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1320 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1321 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1322 psrldq m%4, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1323 movd [%6+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1324 movd [%6], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1325 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1326 movd [%6+%9*4], m%3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1327 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1328 movd [%7+%9], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1329 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1330 ; write dwords 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1331 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1332 psrldq m%2, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1333 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1334 movd [%5+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1335 movd %5, m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1336 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1337 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1338 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1339 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1340 movd [%5+%8*2], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1341 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1342 movd [%6+%9], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1343 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1344 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1345 add %7, %9 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1346 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1347 ; store dwords 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1348 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1349 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1350 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1351 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1352 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1353 mov [%7+%8*4], %5d |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1354 movd [%6+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1355 %else |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1356 movd [%5+%8], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1357 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1358 movd [%6+%9*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1359 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1360 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1361 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1362 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1363 %macro SPLATB_REG_MMX 2-3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1364 movd %1, %2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1365 punpcklbw %1, %1 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1366 punpcklwd %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1367 punpckldq %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1368 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1369 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1370 %macro SPLATB_REG_MMXEXT 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1371 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1372 punpcklbw %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1373 pshufw %1, %1, 0x0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1374 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1375 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1376 %macro SPLATB_REG_SSE2 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1377 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1378 punpcklbw %1, %1 |
12210 | 1379 pshuflw %1, %1, 0x0 |
1380 punpcklqdq %1, %1 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1381 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1382 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1383 %macro SPLATB_REG_SSSE3 3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1384 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1385 pshufb %1, %3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1386 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1387 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
;-----------------------------------------------------------------------------
; SIMPLE_LOOPFILTER opt, dir, nregs
;   %1 = optimization suffix appended to the function name
;        (instantiated with mmx, mmxext, sse2 and ssse3)
;   %2 = filter direction, "v" or "h"
;   %3 = number of general-purpose registers requested from cglobal
;
; Emits vp8_<dir>_loop_filter_simple_<opt>, which takes 3 arguments:
;   r0 = pixel pointer, r1 = stride, r2 = flim.
; After setup r1 holds -stride and r2 holds +stride; flim lives in m7.
;
; With 8-byte registers (mmsize == 8) the body runs twice, r3 being the
; iteration counter; with 16-byte registers it runs once.
; The "h" variant realigns rsp (old value kept in r5) and reserves 2*mmsize
; bytes of stack to hold p1/q1 across the filter.
;
; SPLATB_REG must be %define'd to the implementation matching %1 before this
; macro is instantiated.
;-----------------------------------------------------------------------------
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
    mov            r5, rsp          ; backup stack pointer
    and           rsp, ~(mmsize-1)  ; align stack
%endif
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2            ; two passes of 8 pixels each
%endif
    ; Zero m0 for the splat, but only for XMM variants other than plain sse2.
    ; The previous single-line form "%ifnidn %1, sse2 && mmsize == 16" compared
    ; %1 against the whole text "sse2 && mmsize == 16", so it was always true
    ; and the pxor was emitted for every variant.
    ; NOTE(review): assumes only the SSSE3 splat reads its 3rd operand - confirm
    ; against the SPLATB_REG_* definitions.
%ifnidn %1, sse2
%if mmsize == 16
    pxor           m0, m0
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1           ; r2 = +stride
    neg            r1               ; r1 = -stride
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
    sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4

    mova        [rsp], m0           ; store p1
    mova [rsp+mmsize], m3           ; store q1
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3           ; m4=copy of q1
    mova           m2, m0           ; m2=copy of p1
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]      ; presumably 0x80 per byte: sign-bit flip
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]      ; mask first: psrlq shifts the whole qword
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    ; q0 -= f1: split f1 into its positive and negative parts so that the
    ; unsigned saturating add/sub can be used on the pixel values
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    ; p0 += f2: same sign split as above
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4           ; filtered q0
    mova      [r0+r1], m6           ; filtered p0
%else ; h
    mova           m0, [rsp]        ; p1
    SWAP            2, 4            ; m2 = filtered q0 (was in m4)
    SWAP            1, 6            ; m1 = filtered p0 (was in m6)
    mova           m3, [rsp+mmsize] ; q1

    TRANSPOSE4x4B   0, 1, 2, 3, 4
%if mmsize == 16 ; sse2
    ; r3 presumably holds r4+8*stride after READ_16x4_INTERLEAVED; adding
    ; -stride rebases it to r0+8*stride - TODO confirm against that macro
    add            r3, r1
    WRITE_4x4D      0, 1, 2, 3, r0, r4, r3, r1, r2, 16
%else ; mmx/mmxext
    WRITE_4x2D      0, 1, 2, 3, r0, r4, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8]    ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
%ifidn %2, v
    REP_RET
%else ; h
    mov           rsp, r5           ; restore stack pointer
    RET
%endif
%else ; sse2
%ifidn %2, h
    mov           rsp, r5           ; restore stack pointer
%endif
    RET
%endif
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1537 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; Instantiate the simple loop filter once per instruction set and direction.
; SPLATB_REG is re-pointed before each group so the macro body picks up the
; byte-splat implementation matching that instruction set.
; Arguments: <opt suffix>, <direction>, <number of GPRs passed to cglobal>.

INIT_MMX                                ; 8-byte registers
%define SPLATB_REG SPLATB_REG_MMX
SIMPLE_LOOPFILTER mmx,    v, 4
SIMPLE_LOOPFILTER mmx,    h, 6

%define SPLATB_REG SPLATB_REG_MMXEXT
SIMPLE_LOOPFILTER mmxext, v, 4
SIMPLE_LOOPFILTER mmxext, h, 6

INIT_XMM                                ; 16-byte registers
%define SPLATB_REG SPLATB_REG_SSE2
SIMPLE_LOOPFILTER sse2,   v, 3
SIMPLE_LOOPFILTER sse2,   h, 6

%define SPLATB_REG SPLATB_REG_SSSE3
SIMPLE_LOOPFILTER ssse3,  v, 3
SIMPLE_LOOPFILTER ssse3,  h, 6
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1552 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1553 ;----------------------------------------------------------------------------- |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1554 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1555 ; int flimE, int flimI, int hev_thr); |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1556 ;----------------------------------------------------------------------------- |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1557 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1558 %macro INNER_LOOPFILTER 5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1559 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1560 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1561 %define dst8_reg r1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1562 %define mstride_reg r2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1563 %define E_reg r3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1564 %define I_reg r4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1565 %define hev_thr_reg r5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1566 %else ; luma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1567 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1568 %define mstride_reg r1 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1569 %define E_reg r2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1570 %define I_reg r3 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1571 %define hev_thr_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1572 %ifdef m8 ; x86-64, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1573 %define dst8_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1574 %elif mmsize == 16 ; x86-32, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1575 %define dst8_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1576 %else ; x86-32, mmx/mmxext |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1577 %define cnt_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1578 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1579 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1580 %define dst_reg r0 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1581 %define stride_reg E_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1582 %define dst2_reg I_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1583 %ifndef m8 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1584 %define stack_reg hev_thr_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1585 %endif |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1586 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1587 %ifnidn %1, sse2 && mmsize == 16 |
12210 | 1588 pxor m7, m7 |
1589 %endif | |
1590 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1591 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1592 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1593 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1594 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1595 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1596 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1597 ; align stack |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1598 mov stack_reg, rsp ; backup stack pointer |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1599 and rsp, ~(mmsize-1) ; align stack |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1600 %ifidn %2, v |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1601 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1602 ; [3]=hev() result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1603 %else ; h |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1604 sub rsp, mmsize * 5 ; extra storage space for transposes |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1605 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1606 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1607 %define flim_E [rsp] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1608 %define flim_I [rsp+mmsize] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1609 %define hev_thr [rsp+mmsize*2] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1610 %define mask_res [rsp+mmsize*3] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1611 %define p0backup [rsp+mmsize*3] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1612 %define q0backup [rsp+mmsize*4] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1613 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1614 mova flim_E, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1615 mova flim_I, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1616 mova hev_thr, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1617 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1618 %else ; sse2 on x86-64 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1619 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1620 %define flim_E m9 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1621 %define flim_I m10 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1622 %define hev_thr m11 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1623 %define mask_res m12 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1624 %define p0backup m12 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1625 %define q0backup m8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1626 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1627 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1628 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1629 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1630 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1631 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1632 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1633 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1634 mov cnt_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1635 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1636 mov stride_reg, mstride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1637 neg mstride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1638 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1639 lea dst_reg, [dst_reg + stride_reg*4-4] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1640 %if %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1641 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1642 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1643 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1644 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1645 %if mmsize == 8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1646 .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1647 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1648 ; read |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1649 lea dst2_reg, [dst_reg + stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1650 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1651 %if %4 == 8 && mmsize == 16 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1652 %define movrow movh |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1653 %else |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1654 %define movrow mova |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1655 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1656 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1657 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1658 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1659 movrow m5, [dst2_reg] ; q1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1660 movrow m6, [dst2_reg+ stride_reg] ; q2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1661 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1662 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1663 movhps m0, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1664 movhps m2, [dst8_reg+mstride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1665 add dst8_reg, stride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1666 movhps m1, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1667 movhps m5, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1668 movhps m6, [dst8_reg+ stride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1669 movhps m7, [dst8_reg+ stride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1670 add dst8_reg, mstride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1671 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1672 %elif mmsize == 8 ; mmx/mmxext (h) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1673 ; read 8 rows of 8px each |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1674 movu m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1675 movu m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1676 movu m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1677 movu m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1678 movu m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1679 movu m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1680 movu m6, [dst2_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1681 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1682 ; 8x8 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1683 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1684 mova q0backup, m1 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1685 movu m7, [dst2_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1687 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1688 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1689 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1690 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1691 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1692 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1693 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1694 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1695 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1697 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1698 %else ; sse2 (h) |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1699 %if %4 == 16 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1700 lea dst8_reg, [dst_reg + stride_reg*8] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1701 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1702 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1703 ; read 16 rows of 8px each, interleave |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1704 movh m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1705 movh m1, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1706 movh m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1707 movh m5, [dst8_reg+mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1708 movh m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1709 movh m6, [dst8_reg+mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1710 movh m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1711 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1712 punpcklbw m0, m1 ; A/I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1713 punpcklbw m2, m5 ; C/K |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1714 punpcklbw m3, m6 ; D/L |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1715 punpcklbw m4, m7 ; E/M |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1716 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1717 add dst8_reg, stride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1718 movh m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1719 movh m6, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1720 movh m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1721 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1722 punpcklbw m1, m6 ; B/J |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1723 punpcklbw m5, m7 ; F/N |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1724 movh m6, [dst2_reg+ stride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1725 movh m7, [dst8_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 punpcklbw m6, m7 ; G/O |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1727 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1728 ; 8x16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1729 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1730 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1731 SWAP 1, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1732 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1733 mova q0backup, m1 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1734 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1735 movh m7, [dst2_reg+ stride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1736 movh m1, [dst8_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1737 punpcklbw m7, m1 ; H/P |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1738 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1739 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1740 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1741 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1742 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1743 SWAP 1, 8 |
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1744 SWAP 2, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1745 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1746 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1747 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1748 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1749 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1750 %ifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1751 SWAP 5, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1753 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1754 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1755 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1756 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1757 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1758 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1759 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1760 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1761 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1762 mova m4, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 SWAP 4, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1764 psubusb m4, m0 ; p2-p3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1765 psubusb m0, m1 ; p3-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 por m0, m4 ; abs(p3-p2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 mova m4, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 SWAP 4, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1770 psubusb m4, m1 ; p1-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1771 psubusb m1, m2 ; p2-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 por m1, m4 ; abs(p2-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1773 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 mova m4, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 SWAP 4, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 psubusb m4, m7 ; q2-q3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 psubusb m7, m6 ; q3-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 por m7, m4 ; abs(q3-q2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1779 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1780 mova m4, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1781 SWAP 4, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 psubusb m4, m6 ; q1-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 psubusb m6, m5 ; q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1784 por m6, m4 ; abs(q2-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1785 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1786 %ifidn %1, mmx |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1787 mova m4, flim_I |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1788 pxor m3, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1789 psubusb m0, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1790 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1791 psubusb m7, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1792 psubusb m6, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1793 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1794 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1796 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1797 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1798 pand m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1799 pand m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1800 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1801 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 pmaxub m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1803 pmaxub m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1804 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1805 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1806 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1807 SWAP 7, 3 ; now m7 is zero |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1808 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1809 movrow m3, [dst_reg +mstride_reg] ; p0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1810 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1811 movhps m3, [dst8_reg+mstride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1812 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1813 %elifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1814 SWAP 3, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1815 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1816 mova m3, p0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1817 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1818 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 mova m1, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1820 SWAP 1, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 mova m6, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1822 SWAP 3, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1823 psubusb m1, m3 ; p1-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1824 psubusb m6, m2 ; p0-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 por m1, m6 ; abs(p1-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1826 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1827 mova m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1830 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1831 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 pand m0, m1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1833 mova mask_res, m6 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1835 pmaxub m0, m1 ; max_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1836 SWAP 1, 4 ; max_hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1837 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1838 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1839 SWAP 6, 4 ; now m6 is I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1840 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1841 movrow m4, [dst_reg] ; q0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1842 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1843 movhps m4, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1844 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1845 %elifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1846 SWAP 4, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1847 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1848 mova m4, q0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1849 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1850 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1852 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 SWAP 7, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 psubusb m1, m5 ; q0-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 psubusb m7, m4 ; q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 por m1, m7 ; abs(q1-q0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1858 mova m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 psubusb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 psubusb m7, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1864 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 pand m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1867 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1868 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 pmaxub m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 psubusb m0, flim_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1873 pcmpeqb m0, m7 ; max(abs(..)) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1879 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 ; simple_limit |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1883 mova m1, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 SWAP 1, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 mova m6, m4 ; keep copies of p0/q0 around for later use |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 SWAP 6, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 psubusb m1, m4 ; p0-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 psubusb m6, m3 ; q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1889 por m1, m6 ; abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1890 paddusb m1, m1 ; m1=2*abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1891 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1892 mova m7, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1893 SWAP 7, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1894 mova m6, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 SWAP 6, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1896 psubusb m7, m5 ; p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 psubusb m6, m2 ; q1-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 por m7, m6 ; abs(q1-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 pand m7, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 psrlq m7, 1 ; abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 psubusb m7, flim_E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 pand m0, m7 ; normal_limit result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 %ifdef m8 ; x86-64 && sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 mova m8, [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 %define pb_80_var m8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1911 %else ; x86-32 or mmx/mmxext |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 %define pb_80_var [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1913 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 mova m7, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 pxor m1, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 psubsb m1, m7 ; (signed) q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 mova m6, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1921 pxor m6, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1922 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1923 psubsb m6, m7 ; (signed) p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1924 mova m7, mask_res |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1925 pandn m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1926 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1928 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 pand m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 mova m1, [pb_F8] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 mova m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 paddsb m7, [pb_3] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 paddsb m6, [pb_4] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 pand m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 pand m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1939 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1940 pcmpgtb m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1941 psubb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1942 psrlq m7, 3 ; +f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1943 psrlq m0, 3 ; -f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1944 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1945 pandn m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1946 psubusb m3, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1947 paddusb m3, m1 ; p0+f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1948 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1950 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1951 pcmpgtb m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 psubb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 psrlq m6, 3 ; +f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 psrlq m1, 3 ; -f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 pand m1, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 pandn m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1957 psubusb m4, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 paddusb m4, m1 ; q0-f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1959 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1960 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1963 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1965 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1966 mova m7, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1967 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1968 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1969 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1970 pand m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1971 pand m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1972 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 paddusb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1974 pand m1, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 pandn m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1976 psrlq m1, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 psrlq m7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1978 SWAP 0, 7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 psubusb m1, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1981 pavgb m0, m7 ; a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 pavgb m1, m7 ; -a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1984 psubusb m5, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1985 psubusb m2, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1986 paddusb m5, m1 ; q1-a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1987 paddusb m2, m0 ; p1+a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1988 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1989 ; store |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1990 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1991 movrow [dst_reg +mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1992 movrow [dst_reg +mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1993 movrow [dst_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1994 movrow [dst_reg + stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1995 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1996 movhps [dst8_reg+mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1997 movhps [dst8_reg+mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1998 movhps [dst8_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1999 movhps [dst8_reg+ stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2000 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2001 %else ; h |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2002 add dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2003 add dst2_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2004 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2005 ; 4x8/16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2006 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2007 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2008 %if mmsize == 8 ; mmx/mmxext (h) |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2009 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2010 %else ; sse2 (h) |
12180 | 2011 lea dst8_reg, [dst8_reg+mstride_reg+2] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2012 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2013 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2014 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2015 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2016 %if mmsize == 8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2017 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2018 %ifidn %2, h |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2019 sub dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2020 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2021 cmp dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2022 mov dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2023 jnz .next8px |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2024 %else |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2025 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2026 lea dst_reg, [dst_reg + stride_reg*8-2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2027 %else ; v |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2028 add dst_reg, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2029 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2030 dec cnt_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2031 jg .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2032 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2033 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2034 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2035 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2036 mov rsp, stack_reg ; restore stack pointer |
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
2037 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2038 RET |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2039 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2040 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; MMX builds of the inner loop filter.
; INIT_MMX switches the register/macro setup to MMX, and SPLATB_REG is pointed
; at its MMX-specific variant before the macro is expanded.
; INNER_LOOPFILTER arguments as used by the macro body:
;   %1 = instruction-set tag (the body special-cases "mmx" via %ifidn)
;   %2 = edge direction, v or h
;   %4 = edge width in pixels: 16, or 8 (the body labels 8 as chroma)
; NOTE(review): %3 and %5 are presumably the GPR and XMM register counts
; passed on to cglobal; the macro header is not visible here, so confirm.
2041 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2042 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2043 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
2044 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2045 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2046 INNER_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2047 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; MMXEXT builds of the inner loop filter.
; No new INIT_* is issued here, so these are still expanded under the INIT_MMX
; setup from the MMX group; only SPLATB_REG is switched to the MMXEXT variant.
; Same four direction/width combinations as the MMX group (v/h x 16/8).
; In the macro body any %1 other than "mmx" takes the pmaxub/pavgb paths.
2048 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2049 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
2050 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
2051 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
2052 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2053 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; SSE2 builds of the inner loop filter.
; INIT_XMM switches to 16-byte registers (the macro body checks mmsize == 16),
; and SPLATB_REG is pointed at its SSE2 variant.
; The h/16 variant is instantiated with third argument 5 when m8 is defined
; (annotated in the macro body as "x86-64 && sse2") and 6 otherwise.
; NOTE(review): the third argument is presumably the GPR count and the fifth
; (13) the XMM register count for cglobal; the macro header is not visible
; here, so confirm before relying on this.
2054 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2055 %define SPLATB_REG SPLATB_REG_SSE2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2056 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2057 %ifdef m8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2058 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2059 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2060 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2061 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2062 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2063 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2064 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SSSE3 builds of the inner loop filter.
; No new INIT_* is issued, so these expand under the INIT_XMM setup of the
; SSE2 group; only SPLATB_REG is switched to the SSSE3 variant.
; The argument sets mirror the SSE2 group exactly, including the m8-dependent
; third argument (5 vs 6) for the h/16 variant.
; In the part of the macro visible here, %1 is only compared against "mmx",
; so the ssse3 tag follows the same non-mmx paths as sse2.
2065 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 2066 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2067 %ifdef m8 | |
2068 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
2069 %else | |
2070 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
2071 %endif | |
2072 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
2073 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
2074 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2075 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2076 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2077 ; int flimE, int flimI, int hev_thr); |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2078 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2079 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2080 ; write the words held in the mmx/xmm registers (4 per mmx register, 8 per xmm register) to 8 lines, one word per line |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2081 ; 1 and 2 are the registers to write; these can be the same register (for SSE2) |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2082 ; for pre-SSE4: |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2083 ; 3 is a general-purpose register that we will clobber |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2084 ; for SSE4: |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2085 ; 3 is a pointer to the destination's 5th line |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2086 ; 4 is a pointer to the destination's 4th line |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2087 ; 5/6 are -stride and +stride |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2088 ; 7 is the optimization string |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; WRITE_8W: store eight 16-bit words, one per row, to eight rows of the
; destination. Taking %5 = -stride and %6 = +stride (per the parameter notes
; preceding this macro), the rows written are %4-4*stride .. %4+3*stride, with
; word 0 going to the topmost row and word 7 to the bottommost.
;   sse4 path : all eight words are extracted directly from %1 with pextrw;
;               %3 is used as a second row pointer (expected to be %4+stride
;               for the rows to be consecutive). No register is modified.
;   other path: %3 is a scratch GPR that receives two words at a time via
;               movd; %1 and %2 are shifted/unpacked as they are consumed and
;               %4 is stepped by +/-stride between stores. %4 is put back to
;               its entry value only when mmsize == 8; with mmsize == 16 it is
;               left at entry value + stride.
2089 %macro WRITE_8W 7 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; %7 selects the implementation: only the literal "sse4" takes the pextrw path
2090 %ifidn %7, sse4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; pextrw with a memory destination stores word N of %1 straight to memory.
; Rows -4, -2, -1 and 0 are addressed from %4; rows -3, +1, +2 and +3 from %3.
2091 pextrw [%4+%5*4], %1, 0 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2092 pextrw [%3+%5*4], %1, 1 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2093 pextrw [%4+%5*2], %1, 2 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2094 pextrw [%4+%5 ], %1, 3 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2095 pextrw [%4 ], %1, 4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2096 pextrw [%3 ], %1, 5 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2097 pextrw [%3+%6 ], %1, 6 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2098 pextrw [%3+%6*2], %1, 7 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2099 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; --- words 0 and 1: low dword of %1 goes to the scratch GPR %3 ---
2100 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; bring the next dword (words 2-3) down into the low dword of %1
2101 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2102 punpckhdq %1, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2103 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2104 psrldq %1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2105 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; word 0 -> row -4 (%3w is the 16-bit view of %3)
2106 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; move word 1 into the low 16 bits of %3
2107 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; step %4 down one row so the same [%4+%5*4] form now addresses row -3
2108 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2109 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2110 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; --- words 2 and 3 ---
2111 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; with 16-byte registers shift once more, so that if %2 is the same register
; as %1 the next movd from %2 sees words 4-5
2112 %if mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2113 psrldq %1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2114 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; undo the earlier +stride step: %4 is back at its entry value
2115 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; word 2 -> row -2, word 3 -> row -1
2116 mov [%4+%5*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2117 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2118 mov [%4+%5 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2119 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; --- words 4 and 5: taken from %2 ---
2120 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; bring the following dword of %2 (words 6-7) down into its low dword
2121 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2122 punpckhdq %2, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2123 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2124 psrldq %2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2125 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; word 4 -> row 0, word 5 -> row +1
2126 mov [%4 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2127 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2128 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2129 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; --- words 6 and 7 ---
2130 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; step %4 down one row: [%4+%6] is now row +2 and [%4+%6*2] is row +3
2131 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2132 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2133 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2134 mov [%4+%6*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; only the 8-byte (mmx) build puts %4 back to its entry value here
2135 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2136 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2137 %endif |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2138 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2139 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2140 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2141 %macro MBEDGE_LOOPFILTER 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2142 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2143 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2144 %define dst8_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2145 %define mstride_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2146 %define E_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2147 %define I_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2148 %define hev_thr_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2149 %else ; luma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2150 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2151 %define mstride_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2152 %define E_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2153 %define I_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2154 %define hev_thr_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2155 %ifdef m8 ; x86-64, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2156 %define dst8_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 %elif mmsize == 16 ; x86-32, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 %define dst8_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 %else ; x86-32, mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 %define cnt_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2162 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2163 %define dst_reg r0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2164 %define stride_reg E_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %define dst2_reg I_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 %ifndef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %define stack_reg hev_thr_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2170 %ifnidn %1, sse2 && mmsize == 16 |
12210 | 2171 pxor m7, m7 |
2172 %endif | |
2173 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2176 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2177 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2178 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2179 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2180 ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2181 mov stack_reg, rsp ; backup stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2182 and rsp, ~(mmsize-1) ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2183 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 ; [3]=hev() result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 ; [4]=filter tmp result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2186 ; [5]/[6] = p2/q2 backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2187 ; [7]=lim_res sign result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2188 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2189 %define flim_E [rsp] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2190 %define flim_I [rsp+mmsize] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2191 %define hev_thr [rsp+mmsize*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2192 %define mask_res [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2193 %define lim_res [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2194 %define p0backup [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2195 %define q0backup [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2196 %define p2backup [rsp+mmsize*5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2197 %define q2backup [rsp+mmsize*6] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2198 %define lim_sign [rsp+mmsize*7] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2199 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 mova flim_E, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2201 mova flim_I, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2202 mova hev_thr, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2203 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2204 %else ; sse2 on x86-64 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2205 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 %define flim_E m9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 %define flim_I m10 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 %define hev_thr m11 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2209 %define mask_res m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2210 %define lim_res m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2211 %define p0backup m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2212 %define q0backup m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2213 %define p2backup m13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2214 %define q2backup m14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 %define lim_sign m15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2216 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2217 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2218 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2219 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2220 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2222 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2223 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2224 mov cnt_reg, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2226 mov stride_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 neg mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2228 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2229 lea dst_reg, [dst_reg + stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2230 %if %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2232 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2233 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2236 .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2237 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2238 ; read |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 lea dst2_reg, [dst_reg + stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 %if %4 == 8 && mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 %define movrow movh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2243 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 %define movrow mova |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2246 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2247 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2248 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2249 movrow m5, [dst2_reg] ; q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 movrow m6, [dst2_reg+ stride_reg] ; q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2252 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2253 movhps m0, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2254 movhps m2, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 movhps m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 movhps m5, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 movhps m6, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 movhps m7, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 %elif mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 ; read 8 rows of 8px each |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 movu m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 movu m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 movu m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 movu m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 movu m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 movu m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 movu m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 ; 8x8 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 movu m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 %if %4 == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 lea dst8_reg, [dst_reg + stride_reg*8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 ; read 16 rows of 8px each, interleave |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 movh m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 movh m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 movh m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 movh m5, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 movh m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 movh m6, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 movh m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 punpcklbw m0, m1 ; A/I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 punpcklbw m2, m5 ; C/K |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 punpcklbw m3, m6 ; D/L |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 punpcklbw m4, m7 ; E/M |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 movh m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 movh m6, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 movh m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 punpcklbw m1, m6 ; B/J |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 punpcklbw m5, m7 ; F/N |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 movh m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 movh m7, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 punpcklbw m6, m7 ; G/O |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 ; 8x16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 movh m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 movh m1, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 punpcklbw m7, m1 ; H/P |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 SWAP 2, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 SWAP 5, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 mova m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 SWAP 4, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 psubusb m4, m0 ; p2-p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 psubusb m0, m1 ; p3-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 por m0, m4 ; abs(p3-p2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 mova m4, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 SWAP 4, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 psubusb m4, m1 ; p1-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 mova p2backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 psubusb m1, m2 ; p2-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 por m1, m4 ; abs(p2-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 mova m4, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 SWAP 4, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 psubusb m4, m7 ; q2-q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 psubusb m7, m6 ; q3-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 por m7, m4 ; abs(q3-q2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 mova m4, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 SWAP 4, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 psubusb m4, m6 ; q1-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 mova q2backup, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 psubusb m6, m5 ; q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 por m6, m4 ; abs(q2-q1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 mova m4, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 pxor m3, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 psubusb m0, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 psubusb m7, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 psubusb m6, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 pand m7, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 pmaxub m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 pmaxub m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 SWAP 7, 3 ; now m7 is zero |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 movrow m3, [dst_reg +mstride_reg] ; p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 movhps m3, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 %elifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 SWAP 3, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 mova m3, p0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 mova m1, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 SWAP 1, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 mova m6, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 SWAP 3, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 psubusb m1, m3 ; p1-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 psubusb m6, m2 ; p0-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 por m1, m6 ; abs(p1-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 mova mask_res, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 pmaxub m0, m1 ; max_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 SWAP 1, 4 ; max_hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 SWAP 6, 4 ; now m6 is I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 movrow m4, [dst_reg] ; q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 movhps m4, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 %elifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 SWAP 4, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 mova m4, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 SWAP 7, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 psubusb m1, m5 ; q0-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 psubusb m7, m4 ; q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 por m1, m7 ; abs(q1-q0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 mova m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 psubusb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 psubusb m7, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 pand m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 pxor m7, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 pmaxub m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 psubusb m0, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 pcmpeqb m0, m7 ; max(abs(..)) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 ; simple_limit |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 mova m1, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 SWAP 1, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 mova m6, m4 ; keep copies of p0/q0 around for later use |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 SWAP 6, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 psubusb m1, m4 ; p0-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 psubusb m6, m3 ; q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 por m1, m6 ; abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 paddusb m1, m1 ; m1=2*abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 mova m7, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 SWAP 7, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 mova m6, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 SWAP 6, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 psubusb m7, m5 ; p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 psubusb m6, m2 ; q1-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 por m7, m6 ; abs(q1-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 pand m7, [pb_FE] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 psrlq m7, 1 ; abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 psubusb m7, flim_E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 pand m0, m7 ; normal_limit result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 %ifdef m8 ; x86-64 && sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 mova m8, [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 %define pb_80_var m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 %else ; x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 %define pb_80_var [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 mova m7, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 pxor m1, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 psubsb m1, m7 ; (signed) q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 mova m6, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 pxor m6, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 psubsb m6, m7 ; (signed) p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 mova m7, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 pand m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 pand lim_res, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 mova m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 mova lim_res, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 mova m1, [pb_F8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 mova m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 paddsb m7, [pb_3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 paddsb m6, [pb_4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 pand m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 pand m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 pcmpgtb m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 psubb m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 psrlq m7, 3 ; +f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 psrlq m0, 3 ; -f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 pandn m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 psubusb m3, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 paddusb m3, m1 ; p0+f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 pcmpgtb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 psrlq m6, 3 ; +f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 psrlq m1, 3 ; -f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 pand m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 pandn m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 psubusb m4, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 paddusb m4, m1 ; q0-f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 mova m7, [pw_63] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 pcmpgtb m0, m1 ; which are negative |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 punpcklbw m6, m0 ; signed byte->word |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 punpckhbw m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 mova lim_sign, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 mova mask_res, m6 ; backup for later in filter |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 mova lim_res, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 pmullw m6, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 pmullw m1, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 packsswb m6, m1 ; a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 pand m1, m0 ; -a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 pandn m0, m6 ; +a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 psubusb m3, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 paddusb m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 paddusb m3, m0 ; p0+a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 psubusb m4, m0 ; q0-a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 mova m0, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 pmullw m6, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2595 pmullw m1, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2596 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2597 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2599 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 pand m1, m0 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 pandn m0, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 psubusb m2, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 paddusb m5, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 paddusb m2, m0 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2608 psubusb m5, m0 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2609 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2610 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2611 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2614 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2616 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2617 pmullw m6, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2618 pmullw m1, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2619 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2620 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2621 %ifdef m15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2622 SWAP 7, 15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2623 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2624 mova m7, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2625 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2626 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2627 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2628 packsswb m6, m1 ; a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2629 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2630 psubb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2631 pand m0, m7 ; -a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2632 pandn m7, m6 ; +a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2633 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2634 SWAP 1, 13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 SWAP 6, 14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 mova m1, p2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 mova m6, q2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2639 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2640 psubusb m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2641 paddusb m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 paddusb m1, m7 ; p2+a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 psubusb m6, m7 ; q2-a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 ; store |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 movrow [dst2_reg+mstride_reg*4], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 movrow [dst_reg +mstride_reg*2], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2649 movrow [dst_reg +mstride_reg ], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2650 movrow [dst_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2651 movrow [dst2_reg], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 movrow [dst2_reg+ stride_reg ], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 movhps [dst8_reg+mstride_reg*2], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 movhps [dst8_reg+mstride_reg ], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2657 movhps [dst8_reg], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2658 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2659 movhps [dst8_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2660 movhps [dst8_reg+ stride_reg ], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2661 movhps [dst8_reg+ stride_reg*2], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2662 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2663 %else ; h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2664 inc dst_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2665 inc dst2_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2666 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2667 ; 4x8/16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2668 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2669 SBUTTERFLY bw, 5, 6, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2670 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2671 %if mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2672 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2673 add dst_reg, 4 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2674 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 lea dst8_reg, [dst8_reg+mstride_reg+1] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2677 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2678 lea dst_reg, [dst2_reg+mstride_reg+4] |
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2679 lea dst8_reg, [dst8_reg+mstride_reg+4] |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2680 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2681 %ifidn %2, sse4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2682 lea dst_reg, [dst8_reg+ stride_reg] |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2683 %endif |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2684 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2685 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2686 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2687 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2688 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2689 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2690 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2691 sub dst_reg, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2692 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2693 cmp dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2694 mov dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2695 jnz .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2696 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2697 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2698 lea dst_reg, [dst_reg + stride_reg*8-5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2699 %else ; v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2700 add dst_reg, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2701 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2702 dec cnt_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2703 jg .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2704 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2705 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2706 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2707 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2708 mov rsp, stack_reg ; restore stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2709 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2710 RET |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2711 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2712 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2713 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2714 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2715 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2716 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2717 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2718 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2719 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2720 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2721 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
2722 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
2723 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
2724 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2725 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2726 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2727 %define SPLATB_REG SPLATB_REG_SSE2 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2728 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2729 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2730 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2731 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2732 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2733 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2734 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2735 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 |
12210 | 2736 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2737 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 2738 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 |
2739 %ifdef m8 | |
2740 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 | |
2741 %else | |
2742 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 | |
2743 %endif | |
2744 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 | |
2745 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16 | |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2746 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2747 %ifdef m8 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2748 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2749 %else |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2750 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2751 %endif |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2752 MBEDGE_LOOPFILTER sse4, h, 6, 8, 16 |