Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12276:1c299b8f2930 libavcodec
Enable no-loop memory/register saving for ssse3/sse4 also.
author | rbultje |
---|---|
date | Mon, 26 Jul 2010 14:07:57 +0000 |
parents | 709d5848abf8 |
children | da5b503f050d |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
148 cextern pb_1 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
149 cextern pw_3 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
150 cextern pb_3 |
11975 | 151 cextern pw_4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
152 cextern pb_4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
153 cextern pw_9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
154 cextern pw_18 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
155 cextern pw_27 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
156 cextern pw_63 |
11975 | 157 cextern pw_64 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
158 cextern pb_80 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
159 cextern pb_F8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
160 cextern pb_FE |
11975 | 161 |
162 SECTION .text | |
163 | |
164 ;----------------------------------------------------------------------------- | |
165 ; subpel MC functions: | |
166 ; | |
167 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
168 ; uint8_t *src, int srcstride, | |
169 ; int height, int mx, int my); | |
170 ;----------------------------------------------------------------------------- | |
171 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
172 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
173 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
174 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
175 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
331 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 |
11975 | 337 ; 4x4 block, H-only 4-tap filter |
338 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
339 shl r5d, 4 | |
340 %ifdef PIC | |
341 lea r11, [fourtap_filter_hw_m] | |
342 %endif | |
343 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
344 movq mm5, [fourtap_filter_hw+r5] | |
345 movq mm7, [pw_64] | |
346 pxor mm6, mm6 | |
347 | |
348 .nextrow | |
349 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
350 | |
351 ; first set of 2 pixels | |
352 movq mm2, mm1 ; byte ABCD.. | |
353 punpcklbw mm1, mm6 ; byte->word ABCD | |
354 pshufw mm0, mm2, 9 ; byte CDEF.. | |
355 punpcklbw mm0, mm6 ; byte->word CDEF | |
356 pshufw mm3, mm1, 0x94 ; word ABBC | |
357 pshufw mm1, mm0, 0x94 ; word CDDE | |
358 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
359 movq mm0, mm1 ; backup for second set of pixels | |
360 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
361 paddd mm3, mm1 ; finish 1st 2px | |
362 | |
363 ; second set of 2 pixels, use backup of above | |
364 punpckhbw mm2, mm6 ; byte->word EFGH | |
365 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
366 pshufw mm1, mm2, 0x94 ; word EFFG | |
367 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
368 paddd mm0, mm1 ; finish 2nd 2px | |
369 | |
370 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
371 packssdw mm3, mm0 ; merge dword->word (4px) | |
372 paddsw mm3, mm7 ; rounding | |
373 psraw mm3, 7 | |
374 packuswb mm3, mm6 ; clip and word->bytes | |
375 movd [r0], mm3 ; store | |
376 | |
377 ; go to next line | |
378 add r0, r1 | |
379 add r2, r3 | |
380 dec r4 ; next row | |
381 jg .nextrow | |
382 REP_RET | |
383 | |
384 ; 4x4 block, H-only 6-tap filter | |
385 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
386 lea r5d, [r5*3] | |
387 %ifdef PIC | |
388 lea r11, [sixtap_filter_hw_m] | |
389 %endif | |
390 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words | |
391 movq mm5, [sixtap_filter_hw+r5*8-32] | |
392 movq mm6, [sixtap_filter_hw+r5*8-16] | |
393 movq mm7, [pw_64] | |
394 pxor mm3, mm3 | |
395 | |
396 .nextrow | |
397 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
398 | |
399 ; first set of 2 pixels | |
400 movq mm2, mm1 ; byte ABCD.. | |
401 punpcklbw mm1, mm3 ; byte->word ABCD | |
402 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
403 punpckhbw mm2, mm3 ; byte->word EFGH | |
404 punpcklbw mm0, mm3 ; byte->word CDEF | |
405 pshufw mm1, mm1, 0x94 ; word ABBC | |
406 pshufw mm2, mm2, 0x94 ; word EFFG | |
407 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
408 pshufw mm3, mm0, 0x94 ; word CDDE | |
409 movq mm0, mm3 ; backup for second set of pixels | |
410 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
411 paddd mm1, mm3 ; add to 1st 2px cache | |
412 movq mm3, mm2 ; backup for second set of pixels | |
413 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
414 paddd mm1, mm2 ; finish 1st 2px | |
415 | |
416 ; second set of 2 pixels, use backup of above | |
417 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
418 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
419 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
420 paddd mm0, mm3 ; add to 2nd 2px cache | |
421 pxor mm3, mm3 | |
422 punpcklbw mm2, mm3 ; byte->word FGHI | |
423 pshufw mm2, mm2, 0xE9 ; word GHHI | |
424 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
425 paddd mm0, mm2 ; finish 2nd 2px | |
426 | |
427 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
428 packssdw mm1, mm0 ; merge dword->word (4px) | |
429 paddsw mm1, mm7 ; rounding | |
430 psraw mm1, 7 | |
431 packuswb mm1, mm3 ; clip and word->bytes | |
432 movd [r0], mm1 ; store | |
433 | |
434 ; go to next line | |
435 add r0, r1 | |
436 add r2, r3 | |
437 dec r4 ; next row | |
438 jg .nextrow | |
439 REP_RET | |
440 | |
441 ; 4x4 block, H-only 4-tap filter | |
442 INIT_XMM | |
443 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
444 shl r5d, 4 | |
445 %ifdef PIC | |
446 lea r11, [fourtap_filter_hw_m] | |
447 %endif | |
448 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
449 mova m6, [fourtap_filter_hw+r5] | |
450 pxor m7, m7 | |
451 | |
452 .nextrow | |
453 movh m0, [r2-1] | |
454 punpcklbw m0, m7 ; ABCDEFGH | |
455 mova m1, m0 | |
456 mova m2, m0 | |
457 mova m3, m0 | |
458 psrldq m1, 2 ; BCDEFGH | |
459 psrldq m2, 4 ; CDEFGH | |
460 psrldq m3, 6 ; DEFGH | |
461 punpcklwd m0, m1 ; ABBCCDDE | |
462 punpcklwd m2, m3 ; CDDEEFFG | |
463 pmaddwd m0, m5 | |
464 pmaddwd m2, m6 | |
465 paddd m0, m2 | |
466 | |
467 movh m1, [r2+3] | |
468 punpcklbw m1, m7 ; ABCDEFGH | |
469 mova m2, m1 | |
470 mova m3, m1 | |
471 mova m4, m1 | |
472 psrldq m2, 2 ; BCDEFGH | |
473 psrldq m3, 4 ; CDEFGH | |
474 psrldq m4, 6 ; DEFGH | |
475 punpcklwd m1, m2 ; ABBCCDDE | |
476 punpcklwd m3, m4 ; CDDEEFFG | |
477 pmaddwd m1, m5 | |
478 pmaddwd m3, m6 | |
479 paddd m1, m3 | |
480 | |
481 packssdw m0, m1 | |
482 paddsw m0, [pw_64] | |
483 psraw m0, 7 | |
484 packuswb m0, m7 | |
485 movh [r0], m0 ; store | |
486 | |
487 ; go to next line | |
488 add r0, r1 | |
489 add r2, r3 | |
490 dec r4 ; next row | |
491 jg .nextrow | |
492 REP_RET | |
493 | |
494 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8 | |
495 lea r5d, [r5*3] | |
496 %ifdef PIC | |
497 lea r11, [sixtap_filter_hw_m] | |
498 %endif | |
499 lea r5, [sixtap_filter_hw+r5*8] | |
500 pxor m7, m7 | |
501 | |
502 .nextrow | |
503 movu m0, [r2-2] | |
504 mova m6, m0 | |
505 mova m4, m0 | |
506 punpcklbw m0, m7 ; ABCDEFGHI | |
507 mova m1, m0 | |
508 mova m2, m0 | |
509 mova m3, m0 | |
510 psrldq m1, 2 ; BCDEFGH | |
511 psrldq m2, 4 ; CDEFGH | |
512 psrldq m3, 6 ; DEFGH | |
513 psrldq m4, 4 | |
514 punpcklbw m4, m7 ; EFGH | |
515 mova m5, m4 | |
516 psrldq m5, 2 ; FGH | |
517 punpcklwd m0, m1 ; ABBCCDDE | |
518 punpcklwd m2, m3 ; CDDEEFFG | |
519 punpcklwd m4, m5 ; EFFGGHHI | |
520 pmaddwd m0, [r5-48] | |
521 pmaddwd m2, [r5-32] | |
522 pmaddwd m4, [r5-16] | |
523 paddd m0, m2 | |
524 paddd m0, m4 | |
525 | |
526 psrldq m6, 4 | |
527 mova m4, m6 | |
528 punpcklbw m6, m7 ; ABCDEFGHI | |
529 mova m1, m6 | |
530 mova m2, m6 | |
531 mova m3, m6 | |
532 psrldq m1, 2 ; BCDEFGH | |
533 psrldq m2, 4 ; CDEFGH | |
534 psrldq m3, 6 ; DEFGH | |
535 psrldq m4, 4 | |
536 punpcklbw m4, m7 ; EFGH | |
537 mova m5, m4 | |
538 psrldq m5, 2 ; FGH | |
539 punpcklwd m6, m1 ; ABBCCDDE | |
540 punpcklwd m2, m3 ; CDDEEFFG | |
541 punpcklwd m4, m5 ; EFFGGHHI | |
542 pmaddwd m6, [r5-48] | |
543 pmaddwd m2, [r5-32] | |
544 pmaddwd m4, [r5-16] | |
545 paddd m6, m2 | |
546 paddd m6, m4 | |
547 | |
548 packssdw m0, m6 | |
549 paddsw m0, [pw_64] | |
550 psraw m0, 7 | |
551 packuswb m0, m7 | |
552 movh [r0], m0 ; store | |
553 | |
554 ; go to next line | |
555 add r0, r1 | |
556 add r2, r3 | |
557 dec r4 ; next row | |
558 jg .nextrow | |
559 REP_RET | |
560 | |
561 %macro FILTER_V 3 | |
562 ; 4x4 block, V-only 4-tap filter | |
563 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
564 shl r6d, 5 | |
565 %ifdef PIC | |
566 lea r11, [fourtap_filter_v_m] | |
567 %endif | |
568 lea r6, [fourtap_filter_v+r6-32] | |
569 mova m6, [pw_64] | |
570 pxor m7, m7 | |
571 mova m5, [r6+48] | |
572 | |
573 ; read 3 lines | |
574 sub r2, r3 | |
575 movh m0, [r2] | |
576 movh m1, [r2+ r3] | |
577 movh m2, [r2+2*r3] | |
578 add r2, r3 | |
579 punpcklbw m0, m7 | |
580 punpcklbw m1, m7 | |
581 punpcklbw m2, m7 | |
582 | |
583 .nextrow | |
584 ; first calculate negative taps (to prevent losing positive overflows) | |
585 movh m4, [r2+2*r3] ; read new row | |
586 punpcklbw m4, m7 | |
587 mova m3, m4 | |
588 pmullw m0, [r6+0] | |
589 pmullw m4, m5 | |
590 paddsw m4, m0 | |
591 | |
592 ; then calculate positive taps | |
593 mova m0, m1 | |
594 pmullw m1, [r6+16] | |
595 paddsw m4, m1 | |
596 mova m1, m2 | |
597 pmullw m2, [r6+32] | |
598 paddsw m4, m2 | |
599 mova m2, m3 | |
600 | |
601 ; round/clip/store | |
602 paddsw m4, m6 | |
603 psraw m4, 7 | |
604 packuswb m4, m7 | |
605 movh [r0], m4 | |
606 | |
607 ; go to next line | |
608 add r0, r1 | |
609 add r2, r3 | |
610 dec r4 ; next row | |
611 jg .nextrow | |
612 REP_RET | |
613 | |
614 | |
615 ; 4x4 block, V-only 6-tap filter | |
616 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
617 shl r6d, 4 | |
618 lea r6, [r6*3] | |
619 %ifdef PIC | |
620 lea r11, [sixtap_filter_v_m] | |
621 %endif | |
622 lea r6, [sixtap_filter_v+r6-96] | |
623 pxor m7, m7 | |
624 | |
625 ; read 5 lines | |
626 sub r2, r3 | |
627 sub r2, r3 | |
628 movh m0, [r2] | |
629 movh m1, [r2+r3] | |
630 movh m2, [r2+r3*2] | |
631 lea r2, [r2+r3*2] | |
632 add r2, r3 | |
633 movh m3, [r2] | |
634 movh m4, [r2+r3] | |
635 punpcklbw m0, m7 | |
636 punpcklbw m1, m7 | |
637 punpcklbw m2, m7 | |
638 punpcklbw m3, m7 | |
639 punpcklbw m4, m7 | |
640 | |
641 .nextrow | |
642 ; first calculate negative taps (to prevent losing positive overflows) | |
643 mova m5, m1 | |
644 pmullw m5, [r6+16] | |
645 mova m6, m4 | |
646 pmullw m6, [r6+64] | |
647 paddsw m6, m5 | |
648 | |
649 ; then calculate positive taps | |
650 movh m5, [r2+2*r3] ; read new row | |
651 punpcklbw m5, m7 | |
652 pmullw m0, [r6+0] | |
653 paddsw m6, m0 | |
654 mova m0, m1 | |
655 mova m1, m2 | |
656 pmullw m2, [r6+32] | |
657 paddsw m6, m2 | |
658 mova m2, m3 | |
659 pmullw m3, [r6+48] | |
660 paddsw m6, m3 | |
661 mova m3, m4 | |
662 mova m4, m5 | |
663 pmullw m5, [r6+80] | |
664 paddsw m6, m5 | |
665 | |
666 ; round/clip/store | |
667 paddsw m6, [pw_64] | |
668 psraw m6, 7 | |
669 packuswb m6, m7 | |
670 movh [r0], m6 | |
671 | |
672 ; go to next line | |
673 add r0, r1 | |
674 add r2, r3 | |
675 dec r4 ; next row | |
676 jg .nextrow | |
677 REP_RET | |
678 %endmacro | |
679 | |
680 INIT_MMX | |
681 FILTER_V mmxext, 4, 0 | |
682 INIT_XMM | |
683 FILTER_V sse2, 8, 8 | |
684 | |
11991 | 685 %macro FILTER_BILINEAR 3 |
686 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
687 mov r5d, 8*16 | |
688 shl r6d, 4 | |
689 sub r5d, r6d | |
690 %ifdef PIC | |
691 lea r11, [bilinear_filter_vw_m] | |
692 %endif | |
693 pxor m6, m6 | |
12000 | 694 mova m4, [bilinear_filter_vw+r5-16] |
695 mova m5, [bilinear_filter_vw+r6-16] | |
11991 | 696 .nextrow |
697 movh m0, [r2+r3*0] | |
698 movh m1, [r2+r3*1] | |
699 movh m3, [r2+r3*2] | |
700 punpcklbw m0, m6 | |
701 punpcklbw m1, m6 | |
702 punpcklbw m3, m6 | |
703 mova m2, m1 | |
704 pmullw m0, m4 | |
705 pmullw m1, m5 | |
706 pmullw m2, m4 | |
707 pmullw m3, m5 | |
708 paddsw m0, m1 | |
709 paddsw m2, m3 | |
710 psraw m0, 2 | |
711 psraw m2, 2 | |
712 pavgw m0, m6 | |
713 pavgw m2, m6 | |
714 %ifidn %1, mmxext | |
715 packuswb m0, m0 | |
716 packuswb m2, m2 | |
717 movh [r0+r1*0], m0 | |
718 movh [r0+r1*1], m2 | |
719 %else | |
720 packuswb m0, m2 | |
721 movh [r0+r1*0], m0 | |
722 movhps [r0+r1*1], m0 | |
723 %endif | |
724 | |
725 lea r0, [r0+r1*2] | |
726 lea r2, [r2+r3*2] | |
727 sub r4, 2 | |
728 jg .nextrow | |
729 REP_RET | |
730 | |
731 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
732 mov r6d, 8*16 | |
733 shl r5d, 4 | |
734 sub r6d, r5d | |
735 %ifdef PIC | |
736 lea r11, [bilinear_filter_vw_m] | |
737 %endif | |
738 pxor m6, m6 | |
12000 | 739 mova m4, [bilinear_filter_vw+r6-16] |
740 mova m5, [bilinear_filter_vw+r5-16] | |
11991 | 741 .nextrow |
742 movh m0, [r2+r3*0+0] | |
743 movh m1, [r2+r3*0+1] | |
744 movh m2, [r2+r3*1+0] | |
745 movh m3, [r2+r3*1+1] | |
746 punpcklbw m0, m6 | |
747 punpcklbw m1, m6 | |
748 punpcklbw m2, m6 | |
749 punpcklbw m3, m6 | |
750 pmullw m0, m4 | |
751 pmullw m1, m5 | |
752 pmullw m2, m4 | |
753 pmullw m3, m5 | |
754 paddsw m0, m1 | |
755 paddsw m2, m3 | |
756 psraw m0, 2 | |
757 psraw m2, 2 | |
758 pavgw m0, m6 | |
759 pavgw m2, m6 | |
760 %ifidn %1, mmxext | |
761 packuswb m0, m0 | |
762 packuswb m2, m2 | |
763 movh [r0+r1*0], m0 | |
764 movh [r0+r1*1], m2 | |
765 %else | |
766 packuswb m0, m2 | |
767 movh [r0+r1*0], m0 | |
768 movhps [r0+r1*1], m0 | |
769 %endif | |
770 | |
771 lea r0, [r0+r1*2] | |
772 lea r2, [r2+r3*2] | |
773 sub r4, 2 | |
774 jg .nextrow | |
775 REP_RET | |
776 %endmacro | |
777 | |
778 INIT_MMX | |
779 FILTER_BILINEAR mmxext, 4, 0 | |
780 INIT_XMM | |
781 FILTER_BILINEAR sse2, 8, 7 | |
782 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
783 %macro FILTER_BILINEAR_SSSE3 1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
784 cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |
11991 | 785 shl r6d, 4 |
786 %ifdef PIC | |
787 lea r11, [bilinear_filter_vb_m] | |
788 %endif | |
789 pxor m4, m4 | |
12000 | 790 mova m3, [bilinear_filter_vb+r6-16] |
11991 | 791 .nextrow |
792 movh m0, [r2+r3*0] | |
793 movh m1, [r2+r3*1] | |
794 movh m2, [r2+r3*2] | |
795 punpcklbw m0, m1 | |
796 punpcklbw m1, m2 | |
797 pmaddubsw m0, m3 | |
798 pmaddubsw m1, m3 | |
799 psraw m0, 2 | |
800 psraw m1, 2 | |
801 pavgw m0, m4 | |
802 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
803 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
804 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
805 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
806 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
807 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
808 %else |
11991 | 809 packuswb m0, m1 |
810 movh [r0+r1*0], m0 | |
811 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
812 %endif |
11991 | 813 |
814 lea r0, [r0+r1*2] | |
815 lea r2, [r2+r3*2] | |
816 sub r4, 2 | |
817 jg .nextrow | |
818 REP_RET | |
819 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
820 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |
11991 | 821 shl r5d, 4 |
822 %ifdef PIC | |
823 lea r11, [bilinear_filter_vb_m] | |
824 %endif | |
825 pxor m4, m4 | |
826 mova m2, [filter_h2_shuf] | |
12000 | 827 mova m3, [bilinear_filter_vb+r5-16] |
11991 | 828 .nextrow |
829 movu m0, [r2+r3*0] | |
830 movu m1, [r2+r3*1] | |
831 pshufb m0, m2 | |
832 pshufb m1, m2 | |
833 pmaddubsw m0, m3 | |
834 pmaddubsw m1, m3 | |
835 psraw m0, 2 | |
836 psraw m1, 2 | |
837 pavgw m0, m4 | |
838 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
839 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
840 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
841 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
842 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
843 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
844 %else |
11991 | 845 packuswb m0, m1 |
846 movh [r0+r1*0], m0 | |
847 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
848 %endif |
11991 | 849 |
850 lea r0, [r0+r1*2] | |
851 lea r2, [r2+r3*2] | |
852 sub r4, 2 | |
853 jg .nextrow | |
854 REP_RET | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
855 %endmacro |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
856 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
857 INIT_MMX |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
858 FILTER_BILINEAR_SSSE3 4 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
859 INIT_XMM |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
860 FILTER_BILINEAR_SSSE3 8 |
11991 | 861 |
11992 | 862 cglobal put_vp8_pixels8_mmx, 5,5 |
863 .nextrow: | |
864 movq mm0, [r2+r3*0] | |
865 movq mm1, [r2+r3*1] | |
866 lea r2, [r2+r3*2] | |
867 movq [r0+r1*0], mm0 | |
868 movq [r0+r1*1], mm1 | |
869 lea r0, [r0+r1*2] | |
870 sub r4d, 2 | |
871 jg .nextrow | |
872 REP_RET | |
873 | |
874 cglobal put_vp8_pixels16_mmx, 5,5 | |
875 .nextrow: | |
876 movq mm0, [r2+r3*0+0] | |
877 movq mm1, [r2+r3*0+8] | |
878 movq mm2, [r2+r3*1+0] | |
879 movq mm3, [r2+r3*1+8] | |
880 lea r2, [r2+r3*2] | |
881 movq [r0+r1*0+0], mm0 | |
882 movq [r0+r1*0+8], mm1 | |
883 movq [r0+r1*1+0], mm2 | |
884 movq [r0+r1*1+8], mm3 | |
885 lea r0, [r0+r1*2] | |
886 sub r4d, 2 | |
887 jg .nextrow | |
888 REP_RET | |
889 | |
890 cglobal put_vp8_pixels16_sse, 5,5,2 | |
891 .nextrow: | |
892 movups xmm0, [r2+r3*0] | |
893 movups xmm1, [r2+r3*1] | |
894 lea r2, [r2+r3*2] | |
895 movaps [r0+r1*0], xmm0 | |
896 movaps [r0+r1*1], xmm1 | |
897 lea r0, [r0+r1*2] | |
898 sub r4d, 2 | |
899 jg .nextrow | |
900 REP_RET | |
901 | |
11975 | 902 ;----------------------------------------------------------------------------- |
903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
904 ;----------------------------------------------------------------------------- | |
905 | |
12238 | 906 %macro ADD_DC 4 |
907 %4 m2, [r0+%3] | |
908 %4 m3, [r0+r2+%3] | |
909 %4 m4, [r1+%3] | |
910 %4 m5, [r1+r2+%3] | |
911 paddusb m2, %1 | |
912 paddusb m3, %1 | |
913 paddusb m4, %1 | |
914 paddusb m5, %1 | |
915 psubusb m2, %2 | |
916 psubusb m3, %2 | |
917 psubusb m4, %2 | |
918 psubusb m5, %2 | |
919 %4 [r0+%3], m2 | |
920 %4 [r0+r2+%3], m3 | |
921 %4 [r1+%3], m4 | |
922 %4 [r1+r2+%3], m5 | |
923 %endmacro | |
924 | |
925 INIT_MMX | |
11975 | 926 cglobal vp8_idct_dc_add_mmx, 3, 3 |
927 ; load data | |
12238 | 928 movd m0, [r1] |
11975 | 929 |
930 ; calculate DC | |
12238 | 931 paddw m0, [pw_4] |
932 pxor m1, m1 | |
933 psraw m0, 3 | |
934 movd [r1], m1 | |
935 psubw m1, m0 | |
936 packuswb m0, m0 | |
937 packuswb m1, m1 | |
938 punpcklbw m0, m0 | |
939 punpcklbw m1, m1 | |
940 punpcklwd m0, m0 | |
941 punpcklwd m1, m1 | |
11975 | 942 |
943 ; add DC | |
12238 | 944 lea r1, [r0+r2*2] |
945 ADD_DC m0, m1, 0, movh | |
11975 | 946 RET |
947 | |
12238 | 948 INIT_XMM |
11975 | 949 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
950 ; load data | |
12238 | 951 movd m0, [r1] |
952 pxor m1, m1 | |
953 | |
954 ; calculate DC | |
955 paddw m0, [pw_4] | |
956 movd [r1], m1 | |
957 lea r1, [r0+r2*2] | |
958 movd m2, [r0] | |
959 movd m3, [r0+r2] | |
960 movd m4, [r1] | |
961 movd m5, [r1+r2] | |
962 psraw m0, 3 | |
963 pshuflw m0, m0, 0 | |
964 punpcklqdq m0, m0 | |
965 punpckldq m2, m3 | |
966 punpckldq m4, m5 | |
967 punpcklbw m2, m1 | |
968 punpcklbw m4, m1 | |
969 paddw m2, m0 | |
970 paddw m4, m0 | |
971 packuswb m2, m4 | |
972 movd [r0], m2 | |
973 pextrd [r0+r2], m2, 1 | |
974 pextrd [r1], m2, 2 | |
975 pextrd [r1+r2], m2, 3 | |
976 RET | |
977 | |
978 ;----------------------------------------------------------------------------- | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
12238 | 980 ;----------------------------------------------------------------------------- |
981 | |
982 INIT_MMX | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
983 cglobal vp8_idct_dc_add4y_mmx, 3, 3 |
12238 | 984 ; load data |
985 movd m0, [r1+32*0] ; A | |
986 movd m1, [r1+32*2] ; C | |
987 punpcklwd m0, [r1+32*1] ; A B | |
988 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 989 punpckldq m0, m1 ; A B C D |
12238 | 990 pxor m6, m6 |
11975 | 991 |
992 ; calculate DC | |
12238 | 993 paddw m0, [pw_4] |
994 movd [r1+32*0], m6 | |
995 movd [r1+32*1], m6 | |
996 movd [r1+32*2], m6 | |
997 movd [r1+32*3], m6 | |
998 psraw m0, 3 | |
999 psubw m6, m0 | |
1000 packuswb m0, m0 | |
1001 packuswb m6, m6 | |
1002 punpcklbw m0, m0 ; AABBCCDD | |
1003 punpcklbw m6, m6 ; AABBCCDD | |
1004 movq m1, m0 | |
1005 movq m7, m6 | |
1006 punpcklbw m0, m0 ; AAAABBBB | |
1007 punpckhbw m1, m1 ; CCCCDDDD | |
1008 punpcklbw m6, m6 ; AAAABBBB | |
1009 punpckhbw m7, m7 ; CCCCDDDD | |
1010 | |
1011 ; add DC | |
1012 lea r1, [r0+r2*2] | |
1013 ADD_DC m0, m6, 0, mova | |
1014 ADD_DC m1, m7, 8, mova | |
1015 RET | |
1016 | |
1017 INIT_XMM | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1018 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |
12238 | 1019 ; load data |
1020 movd m0, [r1+32*0] ; A | |
1021 movd m1, [r1+32*2] ; C | |
1022 punpcklwd m0, [r1+32*1] ; A B | |
1023 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 1024 punpckldq m0, m1 ; A B C D |
12238 | 1025 pxor m1, m1 |
1026 | |
1027 ; calculate DC | |
1028 paddw m0, [pw_4] | |
1029 movd [r1+32*0], m1 | |
1030 movd [r1+32*1], m1 | |
1031 movd [r1+32*2], m1 | |
1032 movd [r1+32*3], m1 | |
1033 psraw m0, 3 | |
1034 psubw m1, m0 | |
1035 packuswb m0, m0 | |
1036 packuswb m1, m1 | |
1037 punpcklbw m0, m0 | |
1038 punpcklbw m1, m1 | |
1039 punpcklbw m0, m0 | |
1040 punpcklbw m1, m1 | |
1041 | |
1042 ; add DC | |
1043 lea r1, [r0+r2*2] | |
1044 ADD_DC m0, m1, 0, mova | |
11975 | 1045 RET |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1046 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1047 ;----------------------------------------------------------------------------- |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1048 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1049 ;----------------------------------------------------------------------------- |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1050 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1051 INIT_MMX |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1052 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1053 ; load data |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1054 movd m0, [r1+32*0] ; A |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1055 movd m1, [r1+32*2] ; C |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1056 punpcklwd m0, [r1+32*1] ; A B |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1057 punpcklwd m1, [r1+32*3] ; C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1058 punpckldq m0, m1 ; A B C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1059 pxor m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1060 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1061 ; calculate DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1062 paddw m0, [pw_4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1063 movd [r1+32*0], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1064 movd [r1+32*1], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1065 movd [r1+32*2], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1066 movd [r1+32*3], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1067 psraw m0, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1068 psubw m6, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1069 packuswb m0, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1070 packuswb m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1071 punpcklbw m0, m0 ; AABBCCDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1072 punpcklbw m6, m6 ; AABBCCDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1073 movq m1, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1074 movq m7, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1075 punpcklbw m0, m0 ; AAAABBBB |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1076 punpckhbw m1, m1 ; CCCCDDDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1077 punpcklbw m6, m6 ; AAAABBBB |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1078 punpckhbw m7, m7 ; CCCCDDDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1079 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1080 ; add DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1081 lea r1, [r0+r2*2] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1082 ADD_DC m0, m6, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1083 lea r0, [r0+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1084 lea r1, [r1+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1085 ADD_DC m1, m7, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1086 RET |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1087 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1088 ;----------------------------------------------------------------------------- |
12013 | 1089 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1090 ;----------------------------------------------------------------------------- | |
1091 | |
1092 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
1093 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
1094 %macro VP8_MULTIPLY_SUMSUB 4 | |
1095 mova %3, %1 | |
1096 mova %4, %2 | |
1097 pmulhw %3, m6 ;20091(1) | |
1098 pmulhw %4, m6 ;20091(2) | |
1099 paddw %3, %1 | |
1100 paddw %4, %2 | |
12018 | 1101 paddw %1, %1 |
1102 paddw %2, %2 | |
12013 | 1103 pmulhw %1, m7 ;35468(1) |
1104 pmulhw %2, m7 ;35468(2) | |
1105 psubw %1, %4 | |
1106 paddw %2, %3 | |
1107 %endmacro | |
1108 | |
1109 ; calculate x0=%1+%3; x1=%1-%3 | |
1110 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
1111 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
1112 ; %5/%6 are temporary registers | |
1113 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
1114 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
1115 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
1116 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
1117 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
1118 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
1119 SWAP %4, %1 | |
1120 SWAP %4, %3 | |
1121 %endmacro | |
1122 | |
1123 INIT_MMX | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1124 %macro VP8_IDCT_ADD 1 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1125 cglobal vp8_idct_add_%1, 3, 3 |
12013 | 1126 ; load block data |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1127 movq m0, [r1+ 0] |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1128 movq m1, [r1+ 8] |
12013 | 1129 movq m2, [r1+16] |
1130 movq m3, [r1+24] | |
1131 movq m6, [pw_20091] | |
1132 movq m7, [pw_17734] | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1133 %ifidn %1, sse |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1134 xorps xmm0, xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1135 movaps [r1+ 0], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1136 movaps [r1+16], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1137 %else |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1138 pxor m4, m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1139 movq [r1+ 0], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1140 movq [r1+ 8], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1141 movq [r1+16], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1142 movq [r1+24], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1143 %endif |
12013 | 1144 |
1145 ; actual IDCT | |
1146 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1147 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1148 paddw m0, [pw_4] | |
1149 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1150 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1151 | |
1152 ; store | |
1153 pxor m4, m4 | |
1154 lea r1, [r0+2*r2] | |
1155 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
1156 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
1157 | |
1158 RET | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1159 %endmacro |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1160 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1161 VP8_IDCT_ADD mmx |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1162 VP8_IDCT_ADD sse |
12013 | 1163 |
1164 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1165 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1166 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1167 |
12209 | 1168 %macro SCATTER_WHT 3 |
1169 movd r1d, m%1 | |
1170 movd r2d, m%2 | |
1171 mov [r0+2*16*(0+%3)], r1w | |
1172 mov [r0+2*16*(1+%3)], r2w | |
1173 shr r1d, 16 | |
1174 shr r2d, 16 | |
1175 psrlq m%1, 32 | |
1176 psrlq m%2, 32 | |
1177 mov [r0+2*16*(4+%3)], r1w | |
1178 mov [r0+2*16*(5+%3)], r2w | |
1179 movd r1d, m%1 | |
1180 movd r2d, m%2 | |
1181 mov [r0+2*16*(8+%3)], r1w | |
1182 mov [r0+2*16*(9+%3)], r2w | |
1183 shr r1d, 16 | |
1184 shr r2d, 16 | |
1185 mov [r0+2*16*(12+%3)], r1w | |
1186 mov [r0+2*16*(13+%3)], r2w | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1187 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1188 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1189 %macro HADAMARD4_1D 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1190 SUMSUB_BADC m%2, m%1, m%4, m%3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1191 SUMSUB_BADC m%4, m%2, m%3, m%1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1192 SWAP %1, %4, %3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1193 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1194 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1195 INIT_MMX |
12209 | 1196 cglobal vp8_luma_dc_wht_mmx, 2,3 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1197 movq m0, [r1] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1198 movq m1, [r1+8] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1199 movq m2, [r1+16] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1200 movq m3, [r1+24] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1201 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1202 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1203 paddw m0, [pw_3] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1204 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1205 psraw m0, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1206 psraw m1, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1207 psraw m2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1208 psraw m3, 3 |
12209 | 1209 SCATTER_WHT 0, 1, 0 |
1210 SCATTER_WHT 2, 3, 2 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1211 RET |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1212 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1213 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1214 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1215 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1216 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1217 ; macro called with 7 mm register indexes as argument, and 4 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1218 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1219 ; first 4 mm registers will carry the transposed pixel data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1220 ; the other three are scratchspace (one would be sufficient, but this allows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1221 ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1222 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1223 ; first two regular registers are buf+4*stride and buf+5*stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1224 ; third is -stride, fourth is +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1225 %macro READ_8x4_INTERLEAVED 11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1226 ; interleave 8 (A-H) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1227 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1228 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1229 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1230 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1231 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1232 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1233 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1234 punpcklbw m%1, m%5 ; A/B interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1235 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1236 punpcklbw m%2, m%6 ; C/D interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1237 punpcklbw m%3, m%7 ; E/F interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1238 punpcklbw m%4, m%5 ; G/H interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1239 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1240 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1241 ; macro called with 7 mm register indexes as argument, and 5 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1242 ; first 11 mean the same as READ_8x4_TRANSPOSED above |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1243 ; fifth regular register is scratchspace to reach the bottom 8 rows, it |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1244 ; will be set to second regular register + 8*stride at the end |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1245 %macro READ_16x4_INTERLEAVED 12 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1246 ; transpose 16 (A-P) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1247 lea %12, [r0+8*r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1248 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1249 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1250 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1251 movd m%3, [%12+%10*4] ; I0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1252 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1253 movd m%4, [%12+%10*2] ; K0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1254 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1255 movd m%5, [%12+%10] ; L0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1256 movd m%7, [%12] ; M0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1257 add %12, %11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1258 punpcklbw m%1, m%3 ; A/I |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1259 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1260 punpcklbw m%2, m%4 ; C/K |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1261 punpcklbw m%6, m%5 ; D/L |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1262 punpcklbw m%3, m%7 ; E/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1263 punpcklbw m%2, m%6 ; C/D/K/L interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1264 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1265 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1266 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1267 movd m%4, [%12+%10*4] ; J0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1268 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1269 movd m%6, [%12] ; N0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1270 punpcklbw m%5, m%4 ; B/J |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1271 punpcklbw m%7, m%6 ; F/N |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1272 punpcklbw m%1, m%5 ; A/B/I/J interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1273 punpcklbw m%3, m%7 ; E/F/M/N interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1274 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1275 movd m%6, [%12+%11] ; O0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1276 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1277 movd m%7, [%12+%11*2] ; P0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1278 punpcklbw m%4, m%6 ; G/O |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1279 punpcklbw m%5, m%7 ; H/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1280 punpcklbw m%4, m%5 ; G/H/O/P interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1281 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1282 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1283 ; write 4 mm registers of 2 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1284 ; first four arguments are mm register indexes containing source data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1285 ; last four are registers containing buf+4*stride, buf+5*stride, |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1286 ; -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1287 %macro WRITE_4x2D 8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1288 ; write out (2 dwords per register) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1289 movd [%5+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1290 movd [%5+%7*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1291 movd [%5], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1292 movd [%6+%8], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1293 punpckhdq m%1, m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1294 punpckhdq m%2, m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1295 punpckhdq m%3, m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1296 punpckhdq m%4, m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1297 movd [%6+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1298 movd [%5+%7], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1299 movd [%6], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1300 movd [%6+%8*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1301 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1302 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1303 ; write 4 xmm registers of 4 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1304 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1305 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1306 ; we add 1*stride to the third regular registry in the process |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1307 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1308 ; same memory region), or 8 if they cover two separate buffers (third one points to |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1309 ; a different memory region than the first two), allowing for more optimal code for |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1310 ; the 16-width case |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1311 %macro WRITE_4x4D 10 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1312 ; write out (4 dwords per register), start with dwords zero |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1313 movd [%5+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1314 movd [%5], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1315 movd [%7+%8*4], m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1316 movd [%7], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1317 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1318 ; store dwords 1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1319 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1320 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1321 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1322 psrldq m%4, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1323 movd [%6+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1324 movd [%6], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1325 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1326 movd [%6+%9*4], m%3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1327 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1328 movd [%7+%9], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1329 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1330 ; write dwords 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1331 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1332 psrldq m%2, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1333 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1334 movd [%5+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1335 movd %5, m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1336 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1337 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1338 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1339 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1340 movd [%5+%8*2], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1341 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1342 movd [%6+%9], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1343 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1344 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1345 add %7, %9 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1346 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1347 ; store dwords 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1348 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1349 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1350 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1351 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1352 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1353 mov [%7+%8*4], %5d |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1354 movd [%6+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1355 %else |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1356 movd [%5+%8], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1357 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1358 movd [%6+%9*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1359 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1360 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1361 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1362 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1363 %macro SPLATB_REG_MMX 2-3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1364 movd %1, %2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1365 punpcklbw %1, %1 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1366 punpcklwd %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1367 punpckldq %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1368 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1369 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1370 %macro SPLATB_REG_MMXEXT 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1371 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1372 punpcklbw %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1373 pshufw %1, %1, 0x0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1374 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1375 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1376 %macro SPLATB_REG_SSE2 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1377 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1378 punpcklbw %1, %1 |
12210 | 1379 pshuflw %1, %1, 0x0 |
1380 punpcklqdq %1, %1 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1381 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1382 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1383 %macro SPLATB_REG_SSSE3 3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1384 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1385 pshufb %1, %3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1386 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1387 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1388 %macro SIMPLE_LOOPFILTER 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1389 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1390 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1391 mov r5, rsp ; backup stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1392 and rsp, ~(mmsize-1) ; align stack |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1393 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1394 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1395 mov r3, 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1396 %endif |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1397 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1398 %if mmsize == 16 |
12210 | 1399 pxor m0, m0 |
1400 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1401 %endif |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1402 SPLATB_REG m7, r2, m0 ; splat "flim" into register |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1403 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1404 ; set up indexes to address 4 rows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1405 mov r2, r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1406 neg r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1407 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1408 lea r0, [r0+4*r2-2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1409 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1410 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1411 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1412 %if mmsize == 8 ; mmx / mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1413 .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1414 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1415 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1416 ; read 4 half/full rows of pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1417 mova m0, [r0+r1*2] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1418 mova m1, [r0+r1] ; p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1419 mova m2, [r0] ; q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1420 mova m3, [r0+r2] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1421 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1422 lea r4, [r0+r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1423 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1424 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1425 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1426 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1427 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1428 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1429 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1430 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1431 mova [rsp], m0 ; store p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1432 mova [rsp+mmsize], m3 ; store q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1433 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1434 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1435 ; simple_limit |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1436 mova m5, m2 ; m5=backup of q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1437 mova m6, m1 ; m6=backup of p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1438 psubusb m1, m2 ; p0-q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1439 psubusb m2, m6 ; q0-p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1440 por m1, m2 ; FFABS(p0-q0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1441 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1442 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1443 mova m4, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1444 mova m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1445 psubusb m3, m0 ; q1-p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1446 psubusb m0, m4 ; p1-q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1447 por m3, m0 ; FFABS(p1-q1) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1448 mova m0, [pb_80] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1449 pxor m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1450 pxor m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1451 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1452 pand m3, [pb_FE] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1453 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1454 paddusb m3, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1455 psubusb m3, m7 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1456 pxor m1, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1457 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1458 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1459 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1460 mova m4, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1461 pxor m5, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1462 pxor m0, m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1463 psubsb m5, m0 ; q0-p0 (signed) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1464 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1465 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1466 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1467 pand m2, m3 ; apply filter mask (m3) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1468 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1469 mova m3, [pb_F8] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1470 mova m1, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1471 paddsb m2, [pb_4] ; f1<<3=a+4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1472 paddsb m1, [pb_3] ; f2<<3=a+3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1473 pand m2, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1474 pand m1, m3 ; cache f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1475 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1476 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1477 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1478 pcmpgtb m0, m2 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1479 psubb m3, m2 ; -f1<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1480 psrlq m2, 3 ; +f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1481 psrlq m3, 3 ; -f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1482 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1483 pandn m0, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1484 psubusb m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1485 paddusb m4, m3 ; q0-f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1486 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1487 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1488 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1489 pcmpgtb m0, m1 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1490 psubb m3, m1 ; -f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1491 psrlq m1, 3 ; +f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1492 psrlq m3, 3 ; -f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1493 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1494 pandn m0, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1495 paddusb m6, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1496 psubusb m6, m3 ; p0+f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1497 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1498 ; store |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1499 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1500 mova [r0], m4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1501 mova [r0+r1], m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1502 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1503 mova m0, [rsp] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1504 SWAP 2, 4 ; p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1505 SWAP 1, 6 ; q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1506 mova m3, [rsp+mmsize] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1507 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1508 TRANSPOSE4x4B 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1509 %if mmsize == 16 ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1510 add r3, r1 ; change from r4*8*stride to r0+8*stride |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1511 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1512 %else ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1513 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1514 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1515 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1516 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1517 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1518 ; next 8 pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1519 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1520 add r0, 8 ; advance 8 cols = pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1521 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1522 lea r0, [r0+r2*8] ; advance 8 rows = lines |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1523 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1524 dec r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1525 jg .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1526 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1527 REP_RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1528 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1529 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1530 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1531 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1532 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1533 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1534 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1535 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1536 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1537 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1538 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1539 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1540 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1541 %define SPLATB_REG SPLATB_REG_MMX |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1542 SIMPLE_LOOPFILTER mmx, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1543 SIMPLE_LOOPFILTER mmx, h, 6 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1544 %define SPLATB_REG SPLATB_REG_MMXEXT |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1545 SIMPLE_LOOPFILTER mmxext, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1546 SIMPLE_LOOPFILTER mmxext, h, 6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1547 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1548 %define SPLATB_REG SPLATB_REG_SSE2 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1549 SIMPLE_LOOPFILTER sse2, v, 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1550 SIMPLE_LOOPFILTER sse2, h, 6 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1551 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 1552 SIMPLE_LOOPFILTER ssse3, v, 3 |
1553 SIMPLE_LOOPFILTER ssse3, h, 6 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1554 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1555 ;----------------------------------------------------------------------------- |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1556 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1557 ; int flimE, int flimI, int hev_thr); |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1558 ;----------------------------------------------------------------------------- |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1559 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1560 %macro INNER_LOOPFILTER 5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1561 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1562 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1563 %define dst8_reg r1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1564 %define mstride_reg r2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1565 %define E_reg r3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1566 %define I_reg r4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1567 %define hev_thr_reg r5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1568 %else ; luma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1569 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1570 %define mstride_reg r1 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1571 %define E_reg r2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1572 %define I_reg r3 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1573 %define hev_thr_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1574 %ifdef m8 ; x86-64, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1575 %define dst8_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1576 %elif mmsize == 16 ; x86-32, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1577 %define dst8_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1578 %else ; x86-32, mmx/mmxext |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1579 %define cnt_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1580 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1581 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1582 %define dst_reg r0 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1583 %define stride_reg E_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1584 %define dst2_reg I_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1585 %ifndef m8 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1586 %define stack_reg hev_thr_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1587 %endif |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1588 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1589 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1590 %if mmsize == 16 |
12210 | 1591 pxor m7, m7 |
1592 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1593 %endif |
12210 | 1594 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1595 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1596 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1597 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1598 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1599 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1600 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1601 ; align stack |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1602 mov stack_reg, rsp ; backup stack pointer |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1603 and rsp, ~(mmsize-1) ; align stack |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1604 %ifidn %2, v |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1605 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1606 ; [3]=hev() result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1607 %else ; h |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1608 sub rsp, mmsize * 5 ; extra storage space for transposes |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1609 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1610 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1611 %define flim_E [rsp] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1612 %define flim_I [rsp+mmsize] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1613 %define hev_thr [rsp+mmsize*2] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1614 %define mask_res [rsp+mmsize*3] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1615 %define p0backup [rsp+mmsize*3] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1616 %define q0backup [rsp+mmsize*4] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1617 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1618 mova flim_E, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1619 mova flim_I, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1620 mova hev_thr, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1621 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1622 %else ; sse2 on x86-64 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1623 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1624 %define flim_E m9 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1625 %define flim_I m10 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1626 %define hev_thr m11 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1627 %define mask_res m12 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1628 %define p0backup m12 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1629 %define q0backup m8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1630 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1631 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1632 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1633 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1634 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1635 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1636 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1637 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1638 mov cnt_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1639 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1640 mov stride_reg, mstride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1641 neg mstride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1642 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1643 lea dst_reg, [dst_reg + stride_reg*4-4] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1644 %if %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1645 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1646 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1647 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1648 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1649 %if mmsize == 8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1650 .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1651 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1652 ; read |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1653 lea dst2_reg, [dst_reg + stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1654 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1655 %if %4 == 8 && mmsize == 16 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1656 %define movrow movh |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1657 %else |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1658 %define movrow mova |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1659 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1660 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1661 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1662 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1663 movrow m5, [dst2_reg] ; q1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1664 movrow m6, [dst2_reg+ stride_reg] ; q2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1665 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1666 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1667 movhps m0, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1668 movhps m2, [dst8_reg+mstride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1669 add dst8_reg, stride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1670 movhps m1, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1671 movhps m5, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1672 movhps m6, [dst8_reg+ stride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1673 movhps m7, [dst8_reg+ stride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1674 add dst8_reg, mstride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1675 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1676 %elif mmsize == 8 ; mmx/mmxext (h) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1677 ; read 8 rows of 8px each |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1678 movu m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1679 movu m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1680 movu m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1681 movu m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1682 movu m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1683 movu m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1684 movu m6, [dst2_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1685 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 ; 8x8 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1687 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1688 mova q0backup, m1 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1689 movu m7, [dst2_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1690 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1691 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1692 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1693 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1694 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1695 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1697 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1698 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1699 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1700 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1701 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1702 %else ; sse2 (h) |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1703 %if %4 == 16 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1704 lea dst8_reg, [dst_reg + stride_reg*8] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1705 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1706 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1707 ; read 16 rows of 8px each, interleave |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1708 movh m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1709 movh m1, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1710 movh m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1711 movh m5, [dst8_reg+mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1712 movh m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1713 movh m6, [dst8_reg+mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1714 movh m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1715 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1716 punpcklbw m0, m1 ; A/I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1717 punpcklbw m2, m5 ; C/K |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1718 punpcklbw m3, m6 ; D/L |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1719 punpcklbw m4, m7 ; E/M |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1720 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1721 add dst8_reg, stride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1722 movh m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1723 movh m6, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1724 movh m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1725 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 punpcklbw m1, m6 ; B/J |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1727 punpcklbw m5, m7 ; F/N |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1728 movh m6, [dst2_reg+ stride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1729 movh m7, [dst8_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1730 punpcklbw m6, m7 ; G/O |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1731 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1732 ; 8x16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1733 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1734 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1735 SWAP 1, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1736 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1737 mova q0backup, m1 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1738 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1739 movh m7, [dst2_reg+ stride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1740 movh m1, [dst8_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1741 punpcklbw m7, m1 ; H/P |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1742 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1743 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1744 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1745 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1746 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1747 SWAP 1, 8 |
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1748 SWAP 2, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1749 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1750 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1751 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1753 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1754 %ifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1755 SWAP 5, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1756 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1757 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1758 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1759 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1760 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1761 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1762 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1764 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1765 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 mova m4, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 SWAP 4, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 psubusb m4, m0 ; p2-p3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 psubusb m0, m1 ; p3-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1770 por m0, m4 ; abs(p3-p2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1771 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 mova m4, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1773 SWAP 4, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 psubusb m4, m1 ; p1-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 psubusb m1, m2 ; p2-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 por m1, m4 ; abs(p2-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 mova m4, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1779 SWAP 4, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1780 psubusb m4, m7 ; q2-q3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1781 psubusb m7, m6 ; q3-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 por m7, m4 ; abs(q3-q2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1784 mova m4, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1785 SWAP 4, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1786 psubusb m4, m6 ; q1-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1787 psubusb m6, m5 ; q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1788 por m6, m4 ; abs(q2-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1789 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1790 %ifidn %1, mmx |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1791 mova m4, flim_I |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1792 pxor m3, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1793 psubusb m0, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1794 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 psubusb m7, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1796 psubusb m6, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1797 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1798 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1799 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1800 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1801 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 pand m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1803 pand m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1804 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1805 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1806 pmaxub m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1807 pmaxub m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1808 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1809 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1810 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1811 SWAP 7, 3 ; now m7 is zero |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1812 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1813 movrow m3, [dst_reg +mstride_reg] ; p0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1814 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1815 movhps m3, [dst8_reg+mstride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1816 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1817 %elifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1818 SWAP 3, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1820 mova m3, p0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1822 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1823 mova m1, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1824 SWAP 1, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 mova m6, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1826 SWAP 3, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1827 psubusb m1, m3 ; p1-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 psubusb m6, m2 ; p0-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 por m1, m6 ; abs(p1-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1830 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1831 mova m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1833 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1835 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1836 pand m0, m1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1837 mova mask_res, m6 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1838 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1839 pmaxub m0, m1 ; max_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1840 SWAP 1, 4 ; max_hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1841 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1842 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1843 SWAP 6, 4 ; now m6 is I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1844 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1845 movrow m4, [dst_reg] ; q0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1846 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1847 movhps m4, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1848 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1849 %elifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1850 SWAP 4, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1852 mova m4, q0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 SWAP 7, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1858 psubusb m1, m5 ; q0-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 psubusb m7, m4 ; q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 por m1, m7 ; abs(q1-q0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 mova m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 psubusb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1864 psubusb m7, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1867 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1868 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 pand m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1873 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 pmaxub m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 psubusb m0, flim_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 pcmpeqb m0, m7 ; max(abs(..)) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1879 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1883 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 ; simple_limit |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 mova m1, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 SWAP 1, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1889 mova m6, m4 ; keep copies of p0/q0 around for later use |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1890 SWAP 6, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1891 psubusb m1, m4 ; p0-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1892 psubusb m6, m3 ; q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1893 por m1, m6 ; abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1894 paddusb m1, m1 ; m1=2*abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1896 mova m7, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 SWAP 7, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 mova m6, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 SWAP 6, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 psubusb m7, m5 ; p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 psubusb m6, m2 ; q1-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 por m7, m6 ; abs(q1-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 pand m7, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 psrlq m7, 1 ; abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 psubusb m7, flim_E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 pand m0, m7 ; normal_limit result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1911 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 %ifdef m8 ; x86-64 && sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1913 mova m8, [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 %define pb_80_var m8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 %else ; x86-32 or mmx/mmxext |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 %define pb_80_var [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 mova m7, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 pxor m1, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1921 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1922 psubsb m1, m7 ; (signed) q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1923 mova m6, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1924 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1925 pxor m6, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1926 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 psubsb m6, m7 ; (signed) p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1928 mova m7, mask_res |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 pandn m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 pand m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 mova m1, [pb_F8] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 mova m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 paddsb m7, [pb_3] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 paddsb m6, [pb_4] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1939 pand m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1940 pand m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1941 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1942 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1943 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1944 pcmpgtb m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1945 psubb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1946 psrlq m7, 3 ; +f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1947 psrlq m0, 3 ; -f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1948 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 pandn m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1950 psubusb m3, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1951 paddusb m3, m1 ; p0+f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 pcmpgtb m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 psubb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1957 psrlq m6, 3 ; +f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 psrlq m1, 3 ; -f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1959 pand m1, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1960 pandn m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 psubusb m4, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 paddusb m4, m1 ; q0-f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1963 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1965 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1966 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1967 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1968 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1969 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1970 mova m7, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1971 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1972 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1974 pand m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 pand m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1976 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 paddusb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1978 pand m1, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 pandn m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 psrlq m1, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1981 psrlq m7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 SWAP 0, 7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1984 psubusb m1, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1985 pavgb m0, m7 ; a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1986 pavgb m1, m7 ; -a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1987 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1988 psubusb m5, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1989 psubusb m2, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1990 paddusb m5, m1 ; q1-a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1991 paddusb m2, m0 ; p1+a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1992 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1993 ; store |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1994 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1995 movrow [dst_reg +mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1996 movrow [dst_reg +mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1997 movrow [dst_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1998 movrow [dst_reg + stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1999 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2000 movhps [dst8_reg+mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2001 movhps [dst8_reg+mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2002 movhps [dst8_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2003 movhps [dst8_reg+ stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2004 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2005 %else ; h |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2006 add dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2007 add dst2_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2008 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2009 ; 4x8/16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2010 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2011 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2012 %if mmsize == 8 ; mmx/mmxext (h) |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2013 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2014 %else ; sse2 (h) |
12180 | 2015 lea dst8_reg, [dst8_reg+mstride_reg+2] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2016 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2017 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2018 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2019 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2020 %if mmsize == 8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2021 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2022 %ifidn %2, h |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2023 sub dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2024 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2025 cmp dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2026 mov dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2027 jnz .next8px |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2028 %else |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2029 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2030 lea dst_reg, [dst_reg + stride_reg*8-2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2031 %else ; v |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2032 add dst_reg, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2033 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2034 dec cnt_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2035 jg .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2036 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2037 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2038 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2039 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2040 mov rsp, stack_reg ; restore stack pointer |
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
2041 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2042 RET |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2043 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2044 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2045 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2046 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2047 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
2048 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2049 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2050 INNER_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2051 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2052 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2053 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
2054 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
2055 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
2056 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2057 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2058 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2059 %define SPLATB_REG SPLATB_REG_SSE2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2060 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2061 %ifdef m8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2062 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2063 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2064 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2065 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2066 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2067 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2068 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2069 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 2070 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2071 %ifdef m8 | |
2072 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
2073 %else | |
2074 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
2075 %endif | |
2076 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
2077 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
2078 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2079 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2080 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2081 ; int flimE, int flimI, int hev_thr); |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2082 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2083 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2084 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2085 ; 1 and 2 are the registers to write, this can be the same (for SSE2) |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2086 ; for pre-SSE4: |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2087 ; 3 is a general-purpose register that we will clobber |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2088 ; for SSE4: |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2089 ; 3 is a pointer to the destination's 5th line |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2090 ; 4 is a pointer to the destination's 4th line |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2091 ; 5/6 is -stride and +stride |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2092 %macro WRITE_2x4W 6 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2093 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2094 punpckhdq %1, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2095 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2096 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2097 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2098 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2099 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2100 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2101 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2102 mov [%4+%5*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2103 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2104 mov [%4+%5 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2105 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2106 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2107 punpckhdq %2, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2108 mov [%4 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2109 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2110 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2111 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2112 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2113 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2114 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2115 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2116 mov [%4+%6*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2117 add %4, %5 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2118 %endmacro |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2119 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2120 %macro WRITE_8W_SSE2 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2121 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2122 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2123 mov [%3+%4*4], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2124 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2125 add %3, %5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2126 mov [%3+%4*4], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2127 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2128 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2129 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2130 add %3, %4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2131 mov [%3+%4*2], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2132 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2133 mov [%3+%4 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2134 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2135 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2136 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2137 mov [%3 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2138 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2139 mov [%3+%5 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2140 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2141 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2142 add %3, %5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2143 mov [%3+%5 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2144 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2145 mov [%3+%5*2], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2146 %endmacro |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2147 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2148 %macro WRITE_8W_SSE4 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2149 pextrw [%3+%4*4], %1, 0 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2150 pextrw [%2+%4*4], %1, 1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2151 pextrw [%3+%4*2], %1, 2 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2152 pextrw [%3+%4 ], %1, 3 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2153 pextrw [%3 ], %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2154 pextrw [%2 ], %1, 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2155 pextrw [%2+%5 ], %1, 6 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2156 pextrw [%2+%5*2], %1, 7 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 %macro MBEDGE_LOOPFILTER 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2162 %define dst8_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2163 %define mstride_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2164 %define E_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %define I_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 %define hev_thr_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %else ; luma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 %define mstride_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2170 %define E_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2171 %define I_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2172 %define hev_thr_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2173 %ifdef m8 ; x86-64, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %define dst8_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 %elif mmsize == 16 ; x86-32, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2176 %define dst8_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2177 %else ; x86-32, mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2178 %define cnt_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2179 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2180 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2181 %define dst_reg r0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2182 %define stride_reg E_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2183 %define dst2_reg I_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 %ifndef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 %define stack_reg hev_thr_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2186 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2187 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2188 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2189 %if mmsize == 16 |
12210 | 2190 pxor m7, m7 |
2191 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2192 %endif |
12210 | 2193 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2194 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2195 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2196 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2197 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2198 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2199 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2201 mov stack_reg, rsp ; backup stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2202 and rsp, ~(mmsize-1) ; align stack |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2203 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2204 sub rsp, mmsize * 7 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2205 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 ; [3]=hev() result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 ; [4]=filter tmp result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2209 ; [5]/[6] = p2/q2 backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2210 ; [7]=lim_res sign result |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2211 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2212 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2213 %define flim_E [rsp] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2214 %define flim_I [rsp+mmsize] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 %define hev_thr [rsp+mmsize*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2216 %define mask_res [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2217 %define lim_res [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2218 %define p0backup [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2219 %define q0backup [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2220 %define p2backup [rsp+mmsize*5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 %define q2backup [rsp+mmsize*6] |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2222 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2223 %define lim_sign [rsp] |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2224 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 %define lim_sign [rsp+mmsize*7] |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2226 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2228 mova flim_E, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2229 mova flim_I, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2230 mova hev_thr, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2232 %else ; sse2 on x86-64 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2233 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 %define flim_E m9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 %define flim_I m10 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2236 %define hev_thr m11 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2237 %define mask_res m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2238 %define lim_res m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 %define p0backup m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 %define q0backup m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 %define p2backup m13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 %define q2backup m14 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2243 %define lim_sign m9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2246 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2247 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2248 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2249 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2252 mov cnt_reg, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2253 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2254 mov stride_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 neg mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 lea dst_reg, [dst_reg + stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 %if %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 ; read |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 lea dst2_reg, [dst_reg + stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 %if %4 == 8 && mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 %define movrow movh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 %define movrow mova |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 movrow m5, [dst2_reg] ; q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 movrow m6, [dst2_reg+ stride_reg] ; q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 movhps m0, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 movhps m2, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 movhps m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 movhps m5, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 movhps m6, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 movhps m7, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 %elif mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 ; read 8 rows of 8px each |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 movu m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 movu m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 movu m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 movu m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 movu m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 movu m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 movu m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 ; 8x8 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 movu m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 %if %4 == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 lea dst8_reg, [dst_reg + stride_reg*8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 ; read 16 rows of 8px each, interleave |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 movh m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 movh m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 movh m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 movh m5, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 movh m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 movh m6, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 movh m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 punpcklbw m0, m1 ; A/I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 punpcklbw m2, m5 ; C/K |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 punpcklbw m3, m6 ; D/L |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 punpcklbw m4, m7 ; E/M |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 movh m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 movh m6, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 movh m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 punpcklbw m1, m6 ; B/J |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 punpcklbw m5, m7 ; F/N |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 movh m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 movh m7, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 punpcklbw m6, m7 ; G/O |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 ; 8x16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 movh m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 movh m1, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 punpcklbw m7, m1 ; H/P |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 SWAP 2, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 SWAP 5, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 mova m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 SWAP 4, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 psubusb m4, m0 ; p2-p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 psubusb m0, m1 ; p3-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 por m0, m4 ; abs(p3-p2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 mova m4, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 SWAP 4, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 psubusb m4, m1 ; p1-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 mova p2backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 psubusb m1, m2 ; p2-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 por m1, m4 ; abs(p2-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 mova m4, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 SWAP 4, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 psubusb m4, m7 ; q2-q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 psubusb m7, m6 ; q3-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 por m7, m4 ; abs(q3-q2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 mova m4, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 SWAP 4, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 psubusb m4, m6 ; q1-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 mova q2backup, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 psubusb m6, m5 ; q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 por m6, m4 ; abs(q2-q1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 mova m4, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 pxor m3, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 psubusb m0, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 psubusb m7, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 psubusb m6, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 pand m7, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 pmaxub m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 pmaxub m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 SWAP 7, 3 ; now m7 is zero |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 movrow m3, [dst_reg +mstride_reg] ; p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 movhps m3, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 %elifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 SWAP 3, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 mova m3, p0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 mova m1, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 SWAP 1, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 mova m6, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 SWAP 3, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 psubusb m1, m3 ; p1-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 psubusb m6, m2 ; p0-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 por m1, m6 ; abs(p1-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 mova mask_res, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 pmaxub m0, m1 ; max_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 SWAP 1, 4 ; max_hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 SWAP 6, 4 ; now m6 is I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 movrow m4, [dst_reg] ; q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 movhps m4, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 %elifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 SWAP 4, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 mova m4, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 SWAP 7, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 psubusb m1, m5 ; q0-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 psubusb m7, m4 ; q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 por m1, m7 ; abs(q1-q0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 mova m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 psubusb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 psubusb m7, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 pand m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 pxor m7, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 pmaxub m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 psubusb m0, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 pcmpeqb m0, m7 ; max(abs(..)) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 ; simple_limit |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 mova m1, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 SWAP 1, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 mova m6, m4 ; keep copies of p0/q0 around for later use |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 SWAP 6, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 psubusb m1, m4 ; p0-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 psubusb m6, m3 ; q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 por m1, m6 ; abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 paddusb m1, m1 ; m1=2*abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 mova m7, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 SWAP 7, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 mova m6, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 SWAP 6, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 psubusb m7, m5 ; p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 psubusb m6, m2 ; q1-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 por m7, m6 ; abs(q1-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 pand m7, [pb_FE] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 psrlq m7, 1 ; abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 psubusb m7, flim_E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 pand m0, m7 ; normal_limit result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 %ifdef m8 ; x86-64 && sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 mova m8, [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 %define pb_80_var m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 %else ; x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 %define pb_80_var [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 mova m7, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 pxor m1, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 psubsb m1, m7 ; (signed) q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 mova m6, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 pxor m6, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 psubsb m6, m7 ; (signed) p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 mova m7, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 pand m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 mova lim_res, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 pand lim_res, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 mova m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 mova lim_res, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 mova m1, [pb_F8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 mova m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 paddsb m7, [pb_3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 paddsb m6, [pb_4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 pand m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 pand m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 pcmpgtb m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 psubb m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 psrlq m7, 3 ; +f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 psrlq m0, 3 ; -f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 pandn m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 psubusb m3, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 paddusb m3, m1 ; p0+f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 pcmpgtb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 psrlq m6, 3 ; +f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 psrlq m1, 3 ; -f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 pand m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 pandn m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 psubusb m4, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 paddusb m4, m1 ; q0-f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 mova m7, [pw_63] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2595 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2596 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2597 pcmpgtb m0, m1 ; which are negative |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 punpcklbw m6, m0 ; signed byte->word |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2599 punpckhbw m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 mova lim_sign, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 mova mask_res, m6 ; backup for later in filter |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 mova lim_res, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 pmullw m6, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 pmullw m1, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2608 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2609 packsswb m6, m1 ; a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2610 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2611 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 pand m1, m0 ; -a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 pandn m0, m6 ; +a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2614 psubusb m3, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 paddusb m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2616 paddusb m3, m0 ; p0+a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2617 psubusb m4, m0 ; q0-a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2618 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2619 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2620 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2621 mova m0, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2622 pmullw m6, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2623 pmullw m1, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2624 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2625 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2626 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2627 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2628 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2629 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2630 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2631 pand m1, m0 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2632 pandn m0, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2633 psubusb m2, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2634 paddusb m5, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 paddusb m2, m0 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 psubusb m5, m0 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2639 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2640 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2641 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 pmullw m6, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 pmullw m1, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 paddw m1, m7 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2649 %ifdef m9 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2650 SWAP 7, 9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2651 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 mova m7, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2657 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2658 psubb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2659 pand m0, m7 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2660 pandn m7, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2661 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2662 SWAP 1, 13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2663 SWAP 6, 14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2664 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2665 mova m1, p2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2666 mova m6, q2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2667 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2668 psubusb m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2669 paddusb m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2670 paddusb m1, m7 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2671 psubusb m6, m7 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2672 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2673 ; store |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2674 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 movrow [dst2_reg+mstride_reg*4], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 movrow [dst_reg +mstride_reg*2], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2677 movrow [dst_reg +mstride_reg ], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2678 movrow [dst_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2679 movrow [dst2_reg], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2680 movrow [dst2_reg+ stride_reg ], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2681 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2682 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2683 movhps [dst8_reg+mstride_reg*2], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2684 movhps [dst8_reg+mstride_reg ], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2685 movhps [dst8_reg], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2686 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2687 movhps [dst8_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2688 movhps [dst8_reg+ stride_reg ], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2689 movhps [dst8_reg+ stride_reg*2], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2690 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2691 %else ; h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2692 inc dst_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2693 inc dst2_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2694 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2695 ; 4x8/16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2696 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2697 SBUTTERFLY bw, 5, 6, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2698 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2699 %if mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2700 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2701 add dst_reg, 4 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2702 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2703 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2704 lea dst8_reg, [dst8_reg+mstride_reg+1] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2705 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2706 lea dst_reg, [dst2_reg+mstride_reg+4] |
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2707 lea dst8_reg, [dst8_reg+mstride_reg+4] |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2708 %ifidn %1, sse4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2709 add dst2_reg, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2710 %endif |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2711 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2712 %ifidn %1, sse4 |
12268
259988e7ad0f
Fix obvious bug in assignment. Somehow, the test vectors don't test this...
rbultje
parents:
12266
diff
changeset
|
2713 lea dst2_reg, [dst8_reg+ stride_reg] |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2714 %endif |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2715 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2716 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2717 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2718 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2719 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2720 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2721 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2722 sub dst_reg, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2723 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2724 cmp dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2725 mov dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2726 jnz .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2727 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2728 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2729 lea dst_reg, [dst_reg + stride_reg*8-5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2730 %else ; v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2731 add dst_reg, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2732 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2733 dec cnt_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2734 jg .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2735 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2736 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2737 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2738 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2739 mov rsp, stack_reg ; restore stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2740 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2741 RET |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2742 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2743 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2744 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2745 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2746 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2747 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2748 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2749 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2750 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2751 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2752 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
2753 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
2754 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
2755 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2756 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2757 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2758 %define SPLATB_REG SPLATB_REG_SSE2 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2759 %define WRITE_8W WRITE_8W_SSE2 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2760 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2761 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2762 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2763 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2764 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2765 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2766 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2767 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 |
12210 | 2768 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2769 %define SPLATB_REG SPLATB_REG_SSSE3 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2770 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 |
12210 | 2771 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2772 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 |
12210 | 2773 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2774 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 |
12210 | 2775 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2776 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2777 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2778 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2779 %define WRITE_8W WRITE_8W_SSE4 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2780 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2781 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2782 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2783 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2784 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2785 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 |