Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12278:da5b503f050d libavcodec
VP8: Much faster SSE2 MC
5-10% faster or more on Phenom, Athlon 64, and some others.
Helps some on pre-SSSE3 Intel chips as well, but not as much.
author | darkshikari |
---|---|
date | Mon, 26 Jul 2010 19:34:00 +0000 |
parents | 1c299b8f2930 |
children | 7fb91885433c |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
148 cextern pb_1 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
149 cextern pw_3 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
150 cextern pb_3 |
11975 | 151 cextern pw_4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
152 cextern pb_4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
153 cextern pw_9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
154 cextern pw_18 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
155 cextern pw_27 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
156 cextern pw_63 |
11975 | 157 cextern pw_64 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
158 cextern pb_80 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
159 cextern pb_F8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
160 cextern pb_FE |
11975 | 161 |
162 SECTION .text | |
163 | |
164 ;----------------------------------------------------------------------------- | |
165 ; subpel MC functions: | |
166 ; | |
167 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
168 ; uint8_t *src, int srcstride, | |
169 ; int height, int mx, int my); | |
170 ;----------------------------------------------------------------------------- | |
171 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
172 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
173 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
174 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
175 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
331 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 |
11975 | 337 ; 4x4 block, H-only 4-tap filter (MMXEXT): each loop pass filters one row into 4 output bytes, looping while the row counter r4 stays positive |
338 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
339 shl r5d, 4 | |
340 %ifdef PIC | |
341 lea r11, [fourtap_filter_hw_m] | |
342 %endif | |
343 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words: mm4 = F0/F1 pairs from the 16-byte table row just below offset r5 (r5 was scaled by 16 above); mm5 below = F2/F3 pairs from the row at r5 | |
344 movq mm5, [fourtap_filter_hw+r5] | |
345 movq mm7, [pw_64] | |
346 pxor mm6, mm6 | |
347 | |
348 .nextrow | |
349 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels, starting one byte before the address in r2 | |
350 | |
351 ; first set of 2 pixels | |
352 movq mm2, mm1 ; byte ABCD.. (full copy of ABCDEFGH, reused for the second set) | |
353 punpcklbw mm1, mm6 ; byte->word ABCD (mm6 is zero, so this zero-extends) | |
354 pshufw mm0, mm2, 9 ; byte CDEF.. (imm 9 moves words 1 and 2 down to words 0 and 1) | |
355 punpcklbw mm0, mm6 ; byte->word CDEF | |
356 pshufw mm3, mm1, 0x94 ; word ABBC (imm 0x94 selects words 0,1,1,2) | |
357 pshufw mm1, mm0, 0x94 ; word CDDE | |
358 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 (pairwise multiply-add: one dword sum per pixel) | |
359 movq mm0, mm1 ; backup for second set of pixels (CDDE is the F0/F1 input of pixels 3-4) | |
360 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
361 paddd mm3, mm1 ; finish 1st 2px | |
362 | |
363 ; second set of 2 pixels, use backup of above | |
364 punpckhbw mm2, mm6 ; byte->word EFGH | |
365 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
366 pshufw mm1, mm2, 0x94 ; word EFFG | |
367 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
368 paddd mm0, mm1 ; finish 2nd 2px | |
369 | |
370 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
371 packssdw mm3, mm0 ; merge dword->word (4px), with signed saturation | |
372 paddsw mm3, mm7 ; rounding (mm7 holds pw_64, loaded before the loop) | |
373 psraw mm3, 7 | |
374 packuswb mm3, mm6 ; clip and word->bytes (unsigned saturation; upper half comes from the zero register) | |
375 movd [r0], mm3 ; store the 4 output bytes at the address in r0 | |
376 | |
377 ; go to next line | |
378 add r0, r1 | |
379 add r2, r3 | |
380 dec r4 ; next row | |
381 jg .nextrow | |
382 REP_RET | |
383 | |
384 ; 4x4 block, H-only 6-tap filter (MMXEXT): each loop pass filters one row into 4 output bytes, looping while the row counter r4 stays positive | |
385 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
386 lea r5d, [r5*3] | |
387 %ifdef PIC | |
388 lea r11, [sixtap_filter_hw_m] | |
389 %endif | |
390 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words: F0/F1, F2/F3, F4/F5 pairs come from the three 16-byte table rows ending at r5*8 (r5 was tripled above) | |
391 movq mm5, [sixtap_filter_hw+r5*8-32] | |
392 movq mm6, [sixtap_filter_hw+r5*8-16] | |
393 movq mm7, [pw_64] | |
394 pxor mm3, mm3 | |
395 | |
396 .nextrow | |
397 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels, starting two bytes before the address in r2 | |
398 | |
399 ; first set of 2 pixels | |
400 movq mm2, mm1 ; byte ABCD.. (full copy of ABCDEFGH) | |
401 punpcklbw mm1, mm3 ; byte->word ABCD (mm3 is zero here, so this zero-extends) | |
402 pshufw mm0, mm2, 0x9 ; byte CDEF.. (imm 9 moves words 1 and 2 down to words 0 and 1) | |
403 punpckhbw mm2, mm3 ; byte->word EFGH | |
404 punpcklbw mm0, mm3 ; byte->word CDEF | |
405 pshufw mm1, mm1, 0x94 ; word ABBC (imm 0x94 selects words 0,1,1,2) | |
406 pshufw mm2, mm2, 0x94 ; word EFFG | |
407 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
408 pshufw mm3, mm0, 0x94 ; word CDDE (mm3 stops being the zero register from here on) | |
409 movq mm0, mm3 ; backup for second set of pixels (CDDE is their F0/F1 input) | |
410 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
411 paddd mm1, mm3 ; add to 1st 2px cache | |
412 movq mm3, mm2 ; backup for second set of pixels (EFFG is their F2/F3 input) | |
413 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
414 paddd mm1, mm2 ; finish 1st 2px | |
415 | |
416 ; second set of 2 pixels, use backup of above | |
417 movd mm2, [r2+3] ; byte FGHI (prevent overreads): a separate 4-byte load supplies I, the byte just past the 8 loaded at r2-2 | |
418 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
419 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
420 paddd mm0, mm3 ; add to 2nd 2px cache | |
421 pxor mm3, mm3 | |
422 punpcklbw mm2, mm3 ; byte->word FGHI (mm3 was re-zeroed just above after serving as scratch) | |
423 pshufw mm2, mm2, 0xE9 ; word GHHI (imm 0xE9 selects words 1,2,2,3) | |
424 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
425 paddd mm0, mm2 ; finish 2nd 2px | |
426 | |
427 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
428 packssdw mm1, mm0 ; merge dword->word (4px), with signed saturation | |
429 paddsw mm1, mm7 ; rounding (mm7 holds pw_64, loaded before the loop) | |
430 psraw mm1, 7 | |
431 packuswb mm1, mm3 ; clip and word->bytes (mm3 is zero again, so it also serves the next row's unpacks) | |
432 movd [r0], mm1 ; store the 4 output bytes at the address in r0 | |
433 | |
434 ; go to next line | |
435 add r0, r1 | |
436 add r2, r3 | |
437 dec r4 ; next row | |
438 jg .nextrow | |
439 REP_RET | |
440 | |
441 INIT_XMM | |
; put_vp8_epel8_h4_sse2: horizontal 4-tap filter using one word-vector per tap.
; Register roles as used in the body:
;   r0 = destination pointer (advanced by r1 after every row)
;   r2 = source pointer      (advanced by r3 after every row)
;   r4 = remaining row count
;   r5 = filter selector, turned into a pointer to four 16-byte tap vectors
; m7 = zero (for byte->word unpacking and the final pack), m4 = pw_64 (rounding).
12278 | 442 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 |
; selector * 32; together with the -32 bias below this picks the tap-vector group
443 shl r5d, 5 | |
11975 | 444 %ifdef PIC |
12278 | 445 lea r11, [fourtap_filter_v_m] |
11975 | 446 %endif |
12278 | 447 lea r5, [fourtap_filter_v+r5-32] |
11975 | 448 pxor m7, m7 |
12278 | 449 mova m4, [pw_64] |
450 mova m5, [r5+ 0] | |
451 mova m6, [r5+16] | |
; when m8 is defined (more SIMD registers available) taps 2 and 3 are kept
; in registers too; otherwise they are read from memory inside the loop
452 %ifdef m8 | |
453 mova m8, [r5+32] | |
454 mova m9, [r5+48] | |
455 %endif | |
11975 | 456 .nextrow |
; load the same 8 source bytes at the four tap offsets -1, 0, +1, +2
12278 | 457 movq m0, [r2-1] |
458 movq m1, [r2-0] | |
459 movq m2, [r2+1] | |
460 movq m3, [r2+2] | |
; zero-extend bytes to words
461 punpcklbw m0, m7 | |
462 punpcklbw m1, m7 | |
463 punpcklbw m2, m7 | |
464 punpcklbw m3, m7 | |
; multiply each shifted copy by its tap
465 pmullw m0, m5 | |
466 pmullw m1, m6 | |
467 %ifdef m8 | |
468 pmullw m2, m8 | |
469 pmullw m3, m9 | |
470 %else | |
471 pmullw m2, [r5+32] | |
472 pmullw m3, [r5+48] | |
473 %endif | |
; sum the four products with signed saturation, add rounding, shift by 7,
; then pack to bytes with unsigned saturation
474 paddsw m0, m1 | |
475 paddsw m2, m3 | |
476 paddsw m0, m2 | |
477 paddsw m0, m4 | |
11975 | 478 psraw m0, 7 |
479 packuswb m0, m7 | |
480 movh [r0], m0 ; store | |
481 | |
482 ; go to next line | |
483 add r0, r1 | |
484 add r2, r3 | |
485 dec r4 ; next row | |
486 jg .nextrow | |
487 REP_RET | |
488 | |
; put_vp8_epel8_h6_sse2: horizontal 6-tap filter using one word-vector per tap.
; Register roles as used in the body:
;   r0 = destination pointer (advanced by r1 after every row)
;   r2 = source pointer      (advanced by r3 after every row)
;   r4 = remaining row count
;   r5 = filter selector, turned into a pointer to six 16-byte tap vectors
; m7 = zero (for byte->word unpacking and the final pack), m6 = pw_64 (rounding).
12278 | 489 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 |
; selector * 3 * 16 = selector * 48; the -96 bias is applied in the lea below
11975 | 490 lea r5d, [r5*3] |
12278 | 491 shl r5d, 4 |
11975 | 492 %ifdef PIC |
12278 | 493 lea r11, [sixtap_filter_v_m] |
11975 | 494 %endif |
12278 | 495 lea r5, [sixtap_filter_v+r5-96] |
11975 | 496 pxor m7, m7 |
12278 | 497 mova m6, [pw_64] |
; when m8 is defined (more SIMD registers available) all six taps are cached
; in m8-m13; otherwise they are read from memory inside the loop
498 %ifdef m8 | |
499 mova m8, [r5+ 0] | |
500 mova m9, [r5+16] | |
501 mova m10, [r5+32] | |
502 mova m11, [r5+48] | |
503 mova m12, [r5+64] | |
504 mova m13, [r5+80] | |
505 %endif | |
11975 | 506 .nextrow |
; load the same 8 source bytes at the six tap offsets -2 .. +3
12278 | 507 movq m0, [r2-2] |
508 movq m1, [r2-1] | |
509 movq m2, [r2-0] | |
510 movq m3, [r2+1] | |
511 movq m4, [r2+2] | |
512 movq m5, [r2+3] | |
; zero-extend bytes to words
513 punpcklbw m0, m7 | |
514 punpcklbw m1, m7 | |
515 punpcklbw m2, m7 | |
516 punpcklbw m3, m7 | |
517 punpcklbw m4, m7 | |
518 punpcklbw m5, m7 | |
; multiply each shifted copy by its tap
519 %ifdef m8 | |
520 pmullw m0, m8 | |
521 pmullw m1, m9 | |
522 pmullw m2, m10 | |
523 pmullw m3, m11 | |
524 pmullw m4, m12 | |
525 pmullw m5, m13 | |
526 %else | |
527 pmullw m0, [r5+ 0] | |
528 pmullw m1, [r5+16] | |
529 pmullw m2, [r5+32] | |
530 pmullw m3, [r5+48] | |
531 pmullw m4, [r5+64] | |
532 pmullw m5, [r5+80] | |
533 %endif | |
; accumulate with signed saturation: taps 1+4 and 0+5 are summed first, then
; taps 2 and 3 are added; finally rounding, shift by 7 and unsigned pack
534 paddsw m1, m4 | |
535 paddsw m0, m5 | |
536 paddsw m1, m2 | |
537 paddsw m0, m3 | |
538 paddsw m0, m1 | |
539 paddsw m0, m6 | |
11975 | 540 psraw m0, 7 |
541 packuswb m0, m7 | |
542 movh [r0], m0 ; store | |
543 | |
544 ; go to next line | |
545 add r0, r1 | |
546 add r2, r3 | |
547 dec r4 ; next row | |
548 jg .nextrow | |
549 REP_RET | |
550 | |
551 %macro FILTER_V 3 | |
552 ; 4- or 8-pixel-wide block (width = second macro argument), V-only 4-tap filter | |
553 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
554 shl r6d, 5 | |
555 %ifdef PIC | |
556 lea r11, [fourtap_filter_v_m] | |
557 %endif | |
558 lea r6, [fourtap_filter_v+r6-32] | |
559 mova m6, [pw_64] | |
560 pxor m7, m7 | |
561 mova m5, [r6+48] | |
562 | |
563 ; read 3 lines | |
564 sub r2, r3 | |
565 movh m0, [r2] | |
566 movh m1, [r2+ r3] | |
567 movh m2, [r2+2*r3] | |
568 add r2, r3 | |
569 punpcklbw m0, m7 | |
570 punpcklbw m1, m7 | |
571 punpcklbw m2, m7 | |
572 | |
573 .nextrow | |
574 ; first calculate negative taps (to prevent losing positive overflows) | |
575 movh m4, [r2+2*r3] ; read new row | |
576 punpcklbw m4, m7 | |
577 mova m3, m4 | |
578 pmullw m0, [r6+0] | |
579 pmullw m4, m5 | |
580 paddsw m4, m0 | |
581 | |
582 ; then calculate positive taps | |
583 mova m0, m1 | |
584 pmullw m1, [r6+16] | |
585 paddsw m4, m1 | |
586 mova m1, m2 | |
587 pmullw m2, [r6+32] | |
588 paddsw m4, m2 | |
589 mova m2, m3 | |
590 | |
591 ; round/clip/store | |
592 paddsw m4, m6 | |
593 psraw m4, 7 | |
594 packuswb m4, m7 | |
595 movh [r0], m4 | |
596 | |
597 ; go to next line | |
598 add r0, r1 | |
599 add r2, r3 | |
600 dec r4 ; next row | |
601 jg .nextrow | |
602 REP_RET | |
603 | |
604 | |
605 ; 4- or 8-pixel-wide block (width = second macro argument), V-only 6-tap filter | |
606 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
607 shl r6d, 4 | |
608 lea r6, [r6*3] | |
609 %ifdef PIC | |
610 lea r11, [sixtap_filter_v_m] | |
611 %endif | |
612 lea r6, [sixtap_filter_v+r6-96] | |
613 pxor m7, m7 | |
614 | |
615 ; read 5 lines | |
616 sub r2, r3 | |
617 sub r2, r3 | |
618 movh m0, [r2] | |
619 movh m1, [r2+r3] | |
620 movh m2, [r2+r3*2] | |
621 lea r2, [r2+r3*2] | |
622 add r2, r3 | |
623 movh m3, [r2] | |
624 movh m4, [r2+r3] | |
625 punpcklbw m0, m7 | |
626 punpcklbw m1, m7 | |
627 punpcklbw m2, m7 | |
628 punpcklbw m3, m7 | |
629 punpcklbw m4, m7 | |
630 | |
631 .nextrow | |
632 ; first calculate negative taps (to prevent losing positive overflows) | |
633 mova m5, m1 | |
634 pmullw m5, [r6+16] | |
635 mova m6, m4 | |
636 pmullw m6, [r6+64] | |
637 paddsw m6, m5 | |
638 | |
639 ; then calculate positive taps | |
640 movh m5, [r2+2*r3] ; read new row | |
641 punpcklbw m5, m7 | |
642 pmullw m0, [r6+0] | |
643 paddsw m6, m0 | |
644 mova m0, m1 | |
645 mova m1, m2 | |
646 pmullw m2, [r6+32] | |
647 paddsw m6, m2 | |
648 mova m2, m3 | |
649 pmullw m3, [r6+48] | |
650 paddsw m6, m3 | |
651 mova m3, m4 | |
652 mova m4, m5 | |
653 pmullw m5, [r6+80] | |
654 paddsw m6, m5 | |
655 | |
656 ; round/clip/store | |
657 paddsw m6, [pw_64] | |
658 psraw m6, 7 | |
659 packuswb m6, m7 | |
660 movh [r0], m6 | |
661 | |
662 ; go to next line | |
663 add r0, r1 | |
664 add r2, r3 | |
665 dec r4 ; next row | |
666 jg .nextrow | |
667 REP_RET | |
668 %endmacro | |
669 | |
670 INIT_MMX | |
671 FILTER_V mmxext, 4, 0 | |
672 INIT_XMM | |
673 FILTER_V sse2, 8, 8 | |
674 | |
11991 | 675 %macro FILTER_BILINEAR 3 |
676 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
677 mov r5d, 8*16 | |
678 shl r6d, 4 | |
679 sub r5d, r6d | |
680 %ifdef PIC | |
681 lea r11, [bilinear_filter_vw_m] | |
682 %endif | |
683 pxor m6, m6 | |
12000 | 684 mova m4, [bilinear_filter_vw+r5-16] |
685 mova m5, [bilinear_filter_vw+r6-16] | |
11991 | 686 .nextrow |
687 movh m0, [r2+r3*0] | |
688 movh m1, [r2+r3*1] | |
689 movh m3, [r2+r3*2] | |
690 punpcklbw m0, m6 | |
691 punpcklbw m1, m6 | |
692 punpcklbw m3, m6 | |
693 mova m2, m1 | |
694 pmullw m0, m4 | |
695 pmullw m1, m5 | |
696 pmullw m2, m4 | |
697 pmullw m3, m5 | |
698 paddsw m0, m1 | |
699 paddsw m2, m3 | |
700 psraw m0, 2 | |
701 psraw m2, 2 | |
702 pavgw m0, m6 | |
703 pavgw m2, m6 | |
704 %ifidn %1, mmxext | |
705 packuswb m0, m0 | |
706 packuswb m2, m2 | |
707 movh [r0+r1*0], m0 | |
708 movh [r0+r1*1], m2 | |
709 %else | |
710 packuswb m0, m2 | |
711 movh [r0+r1*0], m0 | |
712 movhps [r0+r1*1], m0 | |
713 %endif | |
714 | |
715 lea r0, [r0+r1*2] | |
716 lea r2, [r2+r3*2] | |
717 sub r4, 2 | |
718 jg .nextrow | |
719 REP_RET | |
720 | |
721 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
722 mov r6d, 8*16 | |
723 shl r5d, 4 | |
724 sub r6d, r5d | |
725 %ifdef PIC | |
726 lea r11, [bilinear_filter_vw_m] | |
727 %endif | |
728 pxor m6, m6 | |
12000 | 729 mova m4, [bilinear_filter_vw+r6-16] |
730 mova m5, [bilinear_filter_vw+r5-16] | |
11991 | 731 .nextrow |
732 movh m0, [r2+r3*0+0] | |
733 movh m1, [r2+r3*0+1] | |
734 movh m2, [r2+r3*1+0] | |
735 movh m3, [r2+r3*1+1] | |
736 punpcklbw m0, m6 | |
737 punpcklbw m1, m6 | |
738 punpcklbw m2, m6 | |
739 punpcklbw m3, m6 | |
740 pmullw m0, m4 | |
741 pmullw m1, m5 | |
742 pmullw m2, m4 | |
743 pmullw m3, m5 | |
744 paddsw m0, m1 | |
745 paddsw m2, m3 | |
746 psraw m0, 2 | |
747 psraw m2, 2 | |
748 pavgw m0, m6 | |
749 pavgw m2, m6 | |
750 %ifidn %1, mmxext | |
751 packuswb m0, m0 | |
752 packuswb m2, m2 | |
753 movh [r0+r1*0], m0 | |
754 movh [r0+r1*1], m2 | |
755 %else | |
756 packuswb m0, m2 | |
757 movh [r0+r1*0], m0 | |
758 movhps [r0+r1*1], m0 | |
759 %endif | |
760 | |
761 lea r0, [r0+r1*2] | |
762 lea r2, [r2+r3*2] | |
763 sub r4, 2 | |
764 jg .nextrow | |
765 REP_RET | |
766 %endmacro | |
767 | |
768 INIT_MMX | |
769 FILTER_BILINEAR mmxext, 4, 0 | |
770 INIT_XMM | |
771 FILTER_BILINEAR sse2, 8, 7 | |
772 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
773 %macro FILTER_BILINEAR_SSSE3 1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
774 cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |
11991 | 775 shl r6d, 4 |
776 %ifdef PIC | |
777 lea r11, [bilinear_filter_vb_m] | |
778 %endif | |
779 pxor m4, m4 | |
12000 | 780 mova m3, [bilinear_filter_vb+r6-16] |
11991 | 781 .nextrow |
782 movh m0, [r2+r3*0] | |
783 movh m1, [r2+r3*1] | |
784 movh m2, [r2+r3*2] | |
785 punpcklbw m0, m1 | |
786 punpcklbw m1, m2 | |
787 pmaddubsw m0, m3 | |
788 pmaddubsw m1, m3 | |
789 psraw m0, 2 | |
790 psraw m1, 2 | |
791 pavgw m0, m4 | |
792 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
793 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
794 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
795 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
796 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
797 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
798 %else |
11991 | 799 packuswb m0, m1 |
800 movh [r0+r1*0], m0 | |
801 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
802 %endif |
11991 | 803 |
804 lea r0, [r0+r1*2] | |
805 lea r2, [r2+r3*2] | |
806 sub r4, 2 | |
807 jg .nextrow | |
808 REP_RET | |
809 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
810 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |
11991 | 811 shl r5d, 4 |
812 %ifdef PIC | |
813 lea r11, [bilinear_filter_vb_m] | |
814 %endif | |
815 pxor m4, m4 | |
816 mova m2, [filter_h2_shuf] | |
12000 | 817 mova m3, [bilinear_filter_vb+r5-16] |
11991 | 818 .nextrow |
819 movu m0, [r2+r3*0] | |
820 movu m1, [r2+r3*1] | |
821 pshufb m0, m2 | |
822 pshufb m1, m2 | |
823 pmaddubsw m0, m3 | |
824 pmaddubsw m1, m3 | |
825 psraw m0, 2 | |
826 psraw m1, 2 | |
827 pavgw m0, m4 | |
828 pavgw m1, m4 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
829 %if mmsize==8 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
830 packuswb m0, m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
831 packuswb m1, m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
832 movh [r0+r1*0], m0 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
833 movh [r0+r1*1], m1 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
834 %else |
11991 | 835 packuswb m0, m1 |
836 movh [r0+r1*0], m0 | |
837 movhps [r0+r1*1], m0 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
838 %endif |
11991 | 839 |
840 lea r0, [r0+r1*2] | |
841 lea r2, [r2+r3*2] | |
842 sub r4, 2 | |
843 jg .nextrow | |
844 REP_RET | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
845 %endmacro |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
846 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
847 INIT_MMX |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
848 FILTER_BILINEAR_SSSE3 4 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
849 INIT_XMM |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
850 FILTER_BILINEAR_SSSE3 8 |
11991 | 851 |
; put_vp8_pixels8_mmx: plain copy of 8-byte-wide rows, two rows per iteration.
;   r0 = destination pointer, r1 = destination row stride
;   r2 = source pointer,      r3 = source row stride
;   r4d = row count; decremented by 2 per iteration, so the loop always
;         handles rows in pairs
11992 | 852 cglobal put_vp8_pixels8_mmx, 5,5 |
853 .nextrow: | |
; read two consecutive source rows, then step the source down two rows
854 movq mm0, [r2+r3*0] | |
855 movq mm1, [r2+r3*1] | |
856 lea r2, [r2+r3*2] | |
; write them to two consecutive destination rows, then step the destination
857 movq [r0+r1*0], mm0 | |
858 movq [r0+r1*1], mm1 | |
859 lea r0, [r0+r1*2] | |
860 sub r4d, 2 | |
861 jg .nextrow | |
862 REP_RET | |
863 | |
; put_vp8_pixels16_mmx: plain copy of 16-byte-wide rows, two rows per
; iteration, each row moved as two 8-byte halves.
;   r0 = destination pointer, r1 = destination row stride
;   r2 = source pointer,      r3 = source row stride
;   r4d = row count; decremented by 2 per iteration
864 cglobal put_vp8_pixels16_mmx, 5,5 | |
865 .nextrow: | |
; row 0: bytes 0-7 and 8-15; row 1: bytes 0-7 and 8-15
866 movq mm0, [r2+r3*0+0] | |
867 movq mm1, [r2+r3*0+8] | |
868 movq mm2, [r2+r3*1+0] | |
869 movq mm3, [r2+r3*1+8] | |
870 lea r2, [r2+r3*2] | |
871 movq [r0+r1*0+0], mm0 | |
872 movq [r0+r1*0+8], mm1 | |
873 movq [r0+r1*1+0], mm2 | |
874 movq [r0+r1*1+8], mm3 | |
875 lea r0, [r0+r1*2] | |
876 sub r4d, 2 | |
877 jg .nextrow | |
878 REP_RET | |
879 | |
; put_vp8_pixels16_sse: plain copy of 16-byte-wide rows, two rows per iteration.
;   r0 = destination pointer, r1 = destination row stride
;   r2 = source pointer,      r3 = source row stride
;   r4d = row count; decremented by 2 per iteration
; Loads use movups (no alignment requirement on the source); stores use
; movaps, so every destination row address must be 16-byte aligned.
880 cglobal put_vp8_pixels16_sse, 5,5,2 | |
881 .nextrow: | |
882 movups xmm0, [r2+r3*0] | |
883 movups xmm1, [r2+r3*1] | |
884 lea r2, [r2+r3*2] | |
885 movaps [r0+r1*0], xmm0 | |
886 movaps [r0+r1*1], xmm1 | |
887 lea r0, [r0+r1*2] | |
888 sub r4d, 2 | |
889 jg .nextrow | |
890 REP_RET | |
891 | |
11975 | 892 ;----------------------------------------------------------------------------- |
893 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
894 ;----------------------------------------------------------------------------- | |
895 | |
; ADD_DC: apply a signed DC offset to four rows of pixels using only unsigned
; saturating byte arithmetic.
;   argument 1 = register added to every row (paddusb)
;   argument 2 = register subtracted from every row (psubusb)
;   argument 3 = byte offset applied to each row address
;   argument 4 = move instruction used for both the loads and the stores
; The four rows are addressed as r0, r0+r2, r1 and r1+r2, so the caller must
; have set r1 before invoking the macro. Clobbers m2-m5.
12238 | 896 %macro ADD_DC 4 |
897 %4 m2, [r0+%3] | |
898 %4 m3, [r0+r2+%3] | |
899 %4 m4, [r1+%3] | |
900 %4 m5, [r1+r2+%3] | |
901 paddusb m2, %1 | |
902 paddusb m3, %1 | |
903 paddusb m4, %1 | |
904 paddusb m5, %1 | |
905 psubusb m2, %2 | |
906 psubusb m3, %2 | |
907 psubusb m4, %2 | |
908 psubusb m5, %2 | |
909 %4 [r0+%3], m2 | |
910 %4 [r0+r2+%3], m3 | |
911 %4 [r1+%3], m4 | |
912 %4 [r1+r2+%3], m5 | |
913 %endmacro | |
914 | |
915 INIT_MMX | |
; vp8_idct_dc_add_mmx: DC-only inverse transform plus add.
; Per the prototype comment above: r0 = dst, r1 = block, r2 = stride.
; The DC value is rounded as (dc + 4) >> 3 and applied to four rows of dst
; through ADD_DC; the start of the coefficient block is cleared on the way.
11975 | 916 cglobal vp8_idct_dc_add_mmx, 3, 3 |
917 ; load data | |
12238 | 918 movd m0, [r1] |
11975 | 919 |
920 ; calculate DC | |
12238 | 921 paddw m0, [pw_4] |
922 pxor m1, m1 | |
923 psraw m0, 3 | |
; write zero over the 4 bytes that were just read from the block
924 movd [r1], m1 | |
; m1 = -dc; after the unsigned packs below m0 = max(dc, 0) and
; m1 = max(-dc, 0), so adding m0 then subtracting m1 applies a signed offset
925 psubw m1, m0 | |
926 packuswb m0, m0 | |
927 packuswb m1, m1 | |
; replicate the low byte so the low four bytes of each register are equal
928 punpcklbw m0, m0 | |
929 punpcklbw m1, m1 | |
930 punpcklwd m0, m0 | |
931 punpcklwd m1, m1 | |
11975 | 932 |
933 ; add DC | |
; r1 is no longer needed as the block pointer; reuse it for rows 2 and 3
12238 | 934 lea r1, [r0+r2*2] |
935 ADD_DC m0, m1, 0, movh | |
11975 | 936 RET |
937 | |
12238 | 938 INIT_XMM |
; vp8_idct_dc_add_sse4: DC-only inverse transform plus add, done in 16-bit
; arithmetic on all four rows at once.
; Per the prototype comment above: r0 = dst, r1 = block, r2 = stride.
11975 | 939 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
940 ; load data | |
12238 | 941 movd m0, [r1] |
; m1 = 0: used to clear the block and as the zero operand for unpacking
942 pxor m1, m1 | |
943 | |
944 ; calculate DC | |
945 paddw m0, [pw_4] | |
; write zero over the 4 bytes that were just read from the block
946 movd [r1], m1 | |
; r1 is no longer needed as the block pointer; reuse it for rows 2 and 3
947 lea r1, [r0+r2*2] | |
; fetch 4 bytes from each of the four destination rows
948 movd m2, [r0] | |
949 movd m3, [r0+r2] | |
950 movd m4, [r1] | |
951 movd m5, [r1+r2] | |
952 psraw m0, 3 | |
; broadcast the rounded DC word to all eight word lanes
953 pshuflw m0, m0, 0 | |
954 punpcklqdq m0, m0 | |
; pair up rows 0/1 and rows 2/3, then widen the pixels to words
955 punpckldq m2, m3 | |
956 punpckldq m4, m5 | |
957 punpcklbw m2, m1 | |
958 punpcklbw m4, m1 | |
959 paddw m2, m0 | |
960 paddw m4, m0 | |
; clamp to 0..255 and repack: m2 now holds rows 0,1,2,3 as four dwords
961 packuswb m2, m4 | |
962 movd [r0], m2 | |
963 pextrd [r0+r2], m2, 1 | |
964 pextrd [r1], m2, 2 | |
965 pextrd [r1+r2], m2, 3 | |
966 RET | |
967 | |
968 ;----------------------------------------------------------------------------- | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
969 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
12238 | 970 ;----------------------------------------------------------------------------- |
971 | |
972 INIT_MMX | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
973 cglobal vp8_idct_dc_add4y_mmx, 3, 3 |
12238 | 974 ; load data |
975 movd m0, [r1+32*0] ; A | |
976 movd m1, [r1+32*2] ; C | |
977 punpcklwd m0, [r1+32*1] ; A B | |
978 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 979 punpckldq m0, m1 ; A B C D |
12238 | 980 pxor m6, m6 |
11975 | 981 |
982 ; calculate DC | |
12238 | 983 paddw m0, [pw_4] |
984 movd [r1+32*0], m6 | |
985 movd [r1+32*1], m6 | |
986 movd [r1+32*2], m6 | |
987 movd [r1+32*3], m6 | |
988 psraw m0, 3 | |
989 psubw m6, m0 | |
990 packuswb m0, m0 | |
991 packuswb m6, m6 | |
992 punpcklbw m0, m0 ; AABBCCDD | |
993 punpcklbw m6, m6 ; AABBCCDD | |
994 movq m1, m0 | |
995 movq m7, m6 | |
996 punpcklbw m0, m0 ; AAAABBBB | |
997 punpckhbw m1, m1 ; CCCCDDDD | |
998 punpcklbw m6, m6 ; AAAABBBB | |
999 punpckhbw m7, m7 ; CCCCDDDD | |
1000 | |
1001 ; add DC | |
1002 lea r1, [r0+r2*2] | |
1003 ADD_DC m0, m6, 0, mova | |
1004 ADD_DC m1, m7, 8, mova | |
1005 RET | |
1006 | |
1007 INIT_XMM | |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1008 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |
12238 | 1009 ; load data |
1010 movd m0, [r1+32*0] ; A | |
1011 movd m1, [r1+32*2] ; C | |
1012 punpcklwd m0, [r1+32*1] ; A B | |
1013 punpcklwd m1, [r1+32*3] ; C D | |
12239 | 1014 punpckldq m0, m1 ; A B C D |
12238 | 1015 pxor m1, m1 |
1016 | |
1017 ; calculate DC | |
1018 paddw m0, [pw_4] | |
1019 movd [r1+32*0], m1 | |
1020 movd [r1+32*1], m1 | |
1021 movd [r1+32*2], m1 | |
1022 movd [r1+32*3], m1 | |
1023 psraw m0, 3 | |
1024 psubw m1, m0 | |
1025 packuswb m0, m0 | |
1026 packuswb m1, m1 | |
1027 punpcklbw m0, m0 | |
1028 punpcklbw m1, m1 | |
1029 punpcklbw m0, m0 | |
1030 punpcklbw m1, m1 | |
1031 | |
1032 ; add DC | |
1033 lea r1, [r0+r2*2] | |
1034 ADD_DC m0, m1, 0, mova | |
11975 | 1035 RET |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1036 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1037 ;----------------------------------------------------------------------------- |
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1038 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1039 ;----------------------------------------------------------------------------- |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1040 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1041 INIT_MMX |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1042 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1043 ; load data |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1044 movd m0, [r1+32*0] ; A |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1045 movd m1, [r1+32*2] ; C |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1046 punpcklwd m0, [r1+32*1] ; A B |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1047 punpcklwd m1, [r1+32*3] ; C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1048 punpckldq m0, m1 ; A B C D |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1049 pxor m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1050 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1051 ; calculate DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1052 paddw m0, [pw_4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1053 movd [r1+32*0], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1054 movd [r1+32*1], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1055 movd [r1+32*2], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1056 movd [r1+32*3], m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1057 psraw m0, 3 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1058 psubw m6, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1059 packuswb m0, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1060 packuswb m6, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1061 punpcklbw m0, m0 ; AABBCCDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1062 punpcklbw m6, m6 ; AABBCCDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1063 movq m1, m0 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1064 movq m7, m6 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1065 punpcklbw m0, m0 ; AAAABBBB |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1066 punpckhbw m1, m1 ; CCCCDDDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1067 punpcklbw m6, m6 ; AAAABBBB |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1068 punpckhbw m7, m7 ; CCCCDDDD |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1069 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1070 ; add DC |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1071 lea r1, [r0+r2*2] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1072 ADD_DC m0, m6, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1073 lea r0, [r0+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1074 lea r1, [r1+r2*4] |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1075 ADD_DC m1, m7, 0, mova |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1076 RET |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1077 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1078 ;----------------------------------------------------------------------------- |
12013 | 1079 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1080 ;----------------------------------------------------------------------------- | |
1081 | |
1082 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
1083 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
1084 %macro VP8_MULTIPLY_SUMSUB 4 | |
1085 mova %3, %1 | |
1086 mova %4, %2 | |
1087 pmulhw %3, m6 ;20091(1) | |
1088 pmulhw %4, m6 ;20091(2) | |
1089 paddw %3, %1 | |
1090 paddw %4, %2 | |
12018 | 1091 paddw %1, %1 |
1092 paddw %2, %2 | |
12013 | 1093 pmulhw %1, m7 ;35468(1) |
1094 pmulhw %2, m7 ;35468(2) | |
1095 psubw %1, %4 | |
1096 paddw %2, %3 | |
1097 %endmacro | |
1098 | |
1099 ; calculate x0=%1+%3; x1=%1-%3 | |
1100 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
1101 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
1102 ; %5/%6 are temporary registers | |
1103 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
1104 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
1105 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
1106 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
1107 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
1108 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
1109 SWAP %4, %1 | |
1110 SWAP %4, %3 | |
1111 %endmacro | |
1112 | |
1113 INIT_MMX | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1114 %macro VP8_IDCT_ADD 1 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1115 cglobal vp8_idct_add_%1, 3, 3 |
12013 | 1116 ; load block data |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1117 movq m0, [r1+ 0] |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1118 movq m1, [r1+ 8] |
12013 | 1119 movq m2, [r1+16] |
1120 movq m3, [r1+24] | |
1121 movq m6, [pw_20091] | |
1122 movq m7, [pw_17734] | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1123 %ifidn %1, sse |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1124 xorps xmm0, xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1125 movaps [r1+ 0], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1126 movaps [r1+16], xmm0 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1127 %else |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1128 pxor m4, m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1129 movq [r1+ 0], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1130 movq [r1+ 8], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1131 movq [r1+16], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1132 movq [r1+24], m4 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1133 %endif |
12013 | 1134 |
1135 ; actual IDCT | |
1136 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1137 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1138 paddw m0, [pw_4] | |
1139 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
1140 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
1141 | |
1142 ; store | |
1143 pxor m4, m4 | |
1144 lea r1, [r0+2*r2] | |
1145 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
1146 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
1147 | |
1148 RET | |
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1149 %endmacro |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1150 |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1151 VP8_IDCT_ADD mmx |
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1152 VP8_IDCT_ADD sse |
12013 | 1153 |
1154 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1155 ; void vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1156 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1157 |
12209 | 1158 %macro SCATTER_WHT 3 |
1159 movd r1d, m%1 | |
1160 movd r2d, m%2 | |
1161 mov [r0+2*16*(0+%3)], r1w | |
1162 mov [r0+2*16*(1+%3)], r2w | |
1163 shr r1d, 16 | |
1164 shr r2d, 16 | |
1165 psrlq m%1, 32 | |
1166 psrlq m%2, 32 | |
1167 mov [r0+2*16*(4+%3)], r1w | |
1168 mov [r0+2*16*(5+%3)], r2w | |
1169 movd r1d, m%1 | |
1170 movd r2d, m%2 | |
1171 mov [r0+2*16*(8+%3)], r1w | |
1172 mov [r0+2*16*(9+%3)], r2w | |
1173 shr r1d, 16 | |
1174 shr r2d, 16 | |
1175 mov [r0+2*16*(12+%3)], r1w | |
1176 mov [r0+2*16*(13+%3)], r2w | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1177 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1178 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1179 %macro HADAMARD4_1D 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1180 SUMSUB_BADC m%2, m%1, m%4, m%3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1181 SUMSUB_BADC m%4, m%2, m%3, m%1 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1182 SWAP %1, %4, %3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1183 %endmacro |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1184 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; vp8_luma_dc_wht_mmx: declared with 2 arguments and 3 general-purpose
; registers.
;   r1 = source: 4 rows of 4 words, read as four qwords at [r1+0..24]
;   r0 = destination base, written by SCATTER_WHT
; Once the input is loaded r1 is no longer needed as a pointer and is reused
; (together with r2) as scratch by SCATTER_WHT.
INIT_MMX
cglobal vp8_luma_dc_wht_mmx, 2,3
    ; load the 4x4 input, one row per register
    movq             m0, [r1]
    movq             m1, [r1+8]
    movq             m2, [r1+16]
    movq             m3, [r1+24]

    ; first 1-D pass, then transpose so the second pass runs along the
    ; other dimension (m4 is the transpose scratch register)
    HADAMARD4_1D      0, 1, 2, 3
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    ; bias for the final >>3; only m0 is biased
    ; NOTE(review): presumably the second pass carries this +3 into every
    ; output -- confirm against SUMSUB_BADC
    paddw            m0, [pw_3]
    HADAMARD4_1D      0, 1, 2, 3

    ; scale all 16 results down by 8 (arithmetic shift)
    psraw            m0, 3
    psraw            m1, 3
    psraw            m2, 3
    psraw            m3, 3

    ; write the results out: m0/m1 to slots 0,1 (+4k), m2/m3 to slots 2,3 (+4k)
    SCATTER_WHT       0, 1, 0
    SCATTER_WHT       2, 3, 2
    RET
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1202 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1203 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1204 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1205 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1206 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the interleaved pixel data (row pairs A/B,
; C/D, E/F and G/H, byte-interleaved; not yet fully transposed)
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; READ_8x4_INTERLEAVED out0, out1, out2, out3, tmp0, tmp1, tmp2, \
;                      base4, base5, neg_stride, pos_stride
;
; Loads 4 bytes from each of 8 rows (A..H) and byte-interleaves them pairwise.
;   %1..%4  mm register indexes receiving A/B, C/D, E/F and G/H
;   %5..%7  mm register indexes used as scratch (left holding H, D and F)
;   %8, %9  row pointers; rows are addressed as %8-4*s (A), %9-4*s (B),
;           %8-2*s (C), %8-s (D), %8 (E), %9 (F), %9+s (G), %9+2*s (H)
;   %10     -stride, %11 = +stride
; No general-purpose register is modified.
%macro READ_8x4_INTERLEAVED 11
    ; fetch rows A-G; H has to wait until scratch register %5 is free again
    movd            m%1, [%8+%10*4]     ; row A, pixels 0-3
    movd            m%5, [%9+%10*4]     ; row B, pixels 0-3
    movd            m%2, [%8+%10*2]     ; row C, pixels 0-3
    movd            m%6, [%8+%10]       ; row D, pixels 0-3
    movd            m%3, [%8]           ; row E, pixels 0-3
    movd            m%7, [%9]           ; row F, pixels 0-3
    movd            m%4, [%9+%11]       ; row G, pixels 0-3

    ; pairwise byte interleave
    punpcklbw       m%1, m%5            ; A0 B0 A1 B1 A2 B2 A3 B3
    movd            m%5, [%9+%11*2]     ; row H, pixels 0-3 (reuses B's scratch)
    punpcklbw       m%2, m%6            ; C0 D0 C1 D1 ...
    punpcklbw       m%3, m%7            ; E0 F0 E1 F1 ...
    punpcklbw       m%4, m%5            ; G0 H0 G1 H1 ...
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1230 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; READ_16x4_INTERLEAVED out0, out1, out2, out3, tmp0, tmp1, tmp2, \
;                       base4, base5, neg_stride, pos_stride, scratch_ptr
;
; Loads 4 bytes from each of 16 rows (A..P) and byte-interleaves them so that
;   m%1 = A/B/I/J, m%2 = C/D/K/L, m%3 = E/F/M/N, m%4 = G/H/O/P
;   %5..%7  register indexes used as scratch
;   %8, %9  pointers to rows E and F (buf+4*stride and buf+5*stride)
;   %10     -stride, %11 = +stride
;   %12     scratch pointer for the bottom 8 rows; set to %8+8*stride, then
;           advanced by one stride, so it ends at %8+9*stride (= %9+8*stride)
; The bottom-row pointer is derived from %8 and %11 rather than from fixed
; register names, so the macro works with whichever registers are passed in.
%macro READ_16x4_INTERLEAVED 12
    ; interleave 16 (A-P) rows of 4 pixels each
    lea             %12, [%8+8*%11]     ; %12 -> row M (8 rows below row E)

    ; read (and interleave) those addressable by %8, A/C/D/E/I/K/L/M
    movd            m%1, [%8+%10*4]     ; A0-3
    movd            m%3, [%12+%10*4]    ; I0-3
    movd            m%2, [%8+%10*2]     ; C0-3
    movd            m%4, [%12+%10*2]    ; K0-3
    movd            m%6, [%8+%10]       ; D0-3
    movd            m%5, [%12+%10]      ; L0-3
    movd            m%7, [%12]          ; M0-3
    add             %12, %11            ; %12 -> row N, mirrors %9 for rows J-P
    punpcklbw       m%1, m%3            ; A/I
    movd            m%3, [%8]           ; E0-3
    punpcklbw       m%2, m%4            ; C/K
    punpcklbw       m%6, m%5            ; D/L
    punpcklbw       m%3, m%7            ; E/M
    punpcklbw       m%2, m%6            ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9, B/F/G/H/J/N/O/P
    movd            m%5, [%9+%10*4]     ; B0-3
    movd            m%4, [%12+%10*4]    ; J0-3
    movd            m%7, [%9]           ; F0-3
    movd            m%6, [%12]          ; N0-3
    punpcklbw       m%5, m%4            ; B/J
    punpcklbw       m%7, m%6            ; F/N
    punpcklbw       m%1, m%5            ; A/B/I/J interleaved
    punpcklbw       m%3, m%7            ; E/F/M/N interleaved
    movd            m%4, [%9+%11]       ; G0-3
    movd            m%6, [%12+%11]      ; O0-3
    movd            m%5, [%9+%11*2]     ; H0-3
    movd            m%7, [%12+%11*2]    ; P0-3
    punpcklbw       m%4, m%6            ; G/O
    punpcklbw       m%5, m%7            ; H/P
    punpcklbw       m%4, m%5            ; G/H/O/P interleaved
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1272 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1273 ; write 4 mm registers of 2 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1274 ; first four arguments are mm register indexes containing source data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1275 ; last four are registers containing buf+4*stride, buf+5*stride, |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1276 ; -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; WRITE_4x2D src0, src1, src2, src3, base4, base5, neg_stride, pos_stride
;
; Stores both dwords of mm registers m%1..m%4 as eight 4-byte rows.
; With %5 = buf+4*stride, %6 = buf+5*stride, %7 = -stride, %8 = +stride:
;   low  dwords of m%1, m%2, m%3, m%4 -> rows 0, 2, 4, 6
;   high dwords of m%1, m%2, m%3, m%4 -> rows 1, 3, 5, 7
; Clobbers m%1..m%4 (each ends up with its high dword duplicated).
%macro WRITE_4x2D 8
    ; low dwords -> even rows
    movd      [%5+%7*4], m%1            ; row 0
    movd      [%5+%7*2], m%2            ; row 2
    movd           [%5], m%3            ; row 4
    movd        [%6+%8], m%4            ; row 6

    ; move the high dword of every register into the low position
    punpckhdq       m%1, m%1
    punpckhdq       m%2, m%2
    punpckhdq       m%3, m%3
    punpckhdq       m%4, m%4

    ; high dwords -> odd rows
    movd      [%6+%7*4], m%1            ; row 1
    movd        [%5+%7], m%2            ; row 3
    movd           [%6], m%3            ; row 5
    movd      [%6+%8*2], m%4            ; row 7
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1292 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
; WRITE_4x4D src0, src1, src2, src3, base4, base5, base12, \
;            neg_stride, pos_stride, width
;
; Stores the four dwords of xmm registers m%1..m%4 as sixteen 4-byte rows.
; With %5 = buf+4*stride, %6 = buf+5*stride, %7 = buf+12*stride,
; %8 = -stride, %9 = +stride, dword d (0..3) of each register lands on:
;   m%1 -> row d, m%2 -> row 4+d, m%3 -> row 8+d, m%4 -> row 12+d
; %10 selects the addressing scheme:
;   16: every row is reached from whichever pointer is convenient
;    8: rows 8-15 are only addressed through %7; row 9 is out of range while
;       %7 still points at row 12, so that dword is parked in GPR %5 and
;       stored after %7 has been advanced
; Side effects: %7 is advanced by one stride, m%1..m%4 are shifted out, and
; for %10 == 8 pointer register %5 is overwritten.
%macro WRITE_4x4D 10
    ; dword 0 of every register
    movd      [%5+%8*4], m%1            ; row 0
    movd           [%5], m%2            ; row 4
    movd      [%7+%8*4], m%3            ; row 8
    movd           [%7], m%4            ; row 12

    ; dword 1 of every register
    psrldq          m%1, 4
    psrldq          m%2, 4
    psrldq          m%3, 4
    psrldq          m%4, 4
    movd      [%6+%8*4], m%1            ; row 1
    movd           [%6], m%2            ; row 5
%if %10 == 16
    movd      [%6+%9*4], m%3            ; row 9, reached from %6
%endif
    movd        [%7+%9], m%4            ; row 13

    ; dword 2 of every register
    psrldq          m%1, 4
    psrldq          m%2, 4
%if %10 == 8
    movd      [%5+%8*2], m%1            ; row 2, last use of %5 as a pointer
    movd             %5, m%3            ; park dword 1 of m%3 (row 9) in %5
%endif
    psrldq          m%3, 4
    psrldq          m%4, 4
%if %10 == 16
    movd      [%5+%8*2], m%1            ; row 2
%endif
    movd        [%6+%9], m%2            ; row 6
    movd      [%7+%8*2], m%3            ; row 10
    movd      [%7+%9*2], m%4            ; row 14
    add              %7, %9             ; %7 now points at row 13

    ; dword 3 of every register
    psrldq          m%1, 4
    psrldq          m%2, 4
    psrldq          m%3, 4
    psrldq          m%4, 4
%if %10 == 8
    mov       [%7+%8*4], %5d            ; row 9, the dword parked earlier
    movd      [%6+%8*2], m%1            ; row 3, via %6 since %5 is gone
%else
    movd        [%5+%8], m%1            ; row 3
%endif
    movd      [%6+%9*2], m%2            ; row 7
    movd      [%7+%8*2], m%3            ; row 11
    movd      [%7+%9*2], m%4            ; row 15
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1352 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMX dst, src [, unused]
;
; Broadcasts the low byte of %2 into all 8 bytes of mm register %1 using
; plain MMX unpacks. The optional third argument is accepted but never used.
%macro SPLATB_REG_MMX 2-3
    movd             %1, %2             ; byte of interest in bits 0-7
    punpcklbw        %1, %1             ; low word  = byte, byte
    punpcklwd        %1, %1             ; low dword = 4 copies
    punpckldq        %1, %1             ; whole register = 8 copies
%endmacro
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1359 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMXEXT dst, src [, unused]
;
; Broadcasts the low byte of %2 into all 8 bytes of mm register %1, using
; pshufw to replicate the doubled byte. The optional third argument is
; accepted but never used.
%macro SPLATB_REG_MMXEXT 2-3
    movd             %1, %2             ; byte of interest in bits 0-7
    punpcklbw        %1, %1             ; low word = byte, byte
    pshufw           %1, %1, 0x0        ; copy word 0 into all four words
%endmacro
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1365 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSE2 dst, src [, unused]
;
; Broadcasts the low byte of %2 into all 16 bytes of xmm register %1.
; The optional third argument is accepted but never used.
%macro SPLATB_REG_SSE2 2-3
    movd             %1, %2             ; byte of interest in bits 0-7
    punpcklbw        %1, %1             ; low word = byte, byte
    pshuflw          %1, %1, 0x0        ; copy word 0 across the low qword
    punpcklqdq       %1, %1             ; duplicate the low qword into the high one
%endmacro
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1372 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSSE3 dst, src, zero
;
; Broadcasts the low byte of %2 into every byte of register %1 with a single
; pshufb. %3 is the shuffle mask and must hold all zeros on entry, so that
; every destination byte selects source byte 0.
%macro SPLATB_REG_SSSE3 3
    movd             %1, %2             ; byte of interest in bits 0-7
    pshufb           %1, %3             ; all-zero mask: byte 0 everywhere
%endmacro
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1377 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1378 %macro SIMPLE_LOOPFILTER 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1379 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1380 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1381 mov r5, rsp ; backup stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1382 and rsp, ~(mmsize-1) ; align stack |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1383 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1384 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1385 mov r3, 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1386 %endif |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1387 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1388 %if mmsize == 16 |
12210 | 1389 pxor m0, m0 |
1390 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1391 %endif |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1392 SPLATB_REG m7, r2, m0 ; splat "flim" into register |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1393 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1394 ; set up indexes to address 4 rows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1395 mov r2, r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1396 neg r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1397 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1398 lea r0, [r0+4*r2-2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1399 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1400 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1401 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1402 %if mmsize == 8 ; mmx / mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1403 .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1404 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1405 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1406 ; read 4 half/full rows of pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1407 mova m0, [r0+r1*2] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1408 mova m1, [r0+r1] ; p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1409 mova m2, [r0] ; q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1410 mova m3, [r0+r2] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1411 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1412 lea r4, [r0+r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1413 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1414 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1415 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1416 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1417 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1418 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1419 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1420 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1421 mova [rsp], m0 ; store p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1422 mova [rsp+mmsize], m3 ; store q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1423 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1424 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1425 ; simple_limit |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1426 mova m5, m2 ; m5=backup of q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1427 mova m6, m1 ; m6=backup of p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1428 psubusb m1, m2 ; p0-q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1429 psubusb m2, m6 ; q0-p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1430 por m1, m2 ; FFABS(p0-q0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1431 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1432 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1433 mova m4, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1434 mova m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1435 psubusb m3, m0 ; q1-p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1436 psubusb m0, m4 ; p1-q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1437 por m3, m0 ; FFABS(p1-q1) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1438 mova m0, [pb_80] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1439 pxor m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1440 pxor m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1441 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1442 pand m3, [pb_FE] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1443 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1444 paddusb m3, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1445 psubusb m3, m7 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1446 pxor m1, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1447 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1448 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1449 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1450 mova m4, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1451 pxor m5, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1452 pxor m0, m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1453 psubsb m5, m0 ; q0-p0 (signed) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1454 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1455 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1456 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1457 pand m2, m3 ; apply filter mask (m3) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1458 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1459 mova m3, [pb_F8] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1460 mova m1, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1461 paddsb m2, [pb_4] ; f1<<3=a+4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1462 paddsb m1, [pb_3] ; f2<<3=a+3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1463 pand m2, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1464 pand m1, m3 ; cache f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1465 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1466 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1467 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1468 pcmpgtb m0, m2 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1469 psubb m3, m2 ; -f1<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1470 psrlq m2, 3 ; +f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1471 psrlq m3, 3 ; -f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1472 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1473 pandn m0, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1474 psubusb m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1475 paddusb m4, m3 ; q0-f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1476 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1477 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1478 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1479 pcmpgtb m0, m1 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1480 psubb m3, m1 ; -f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1481 psrlq m1, 3 ; +f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1482 psrlq m3, 3 ; -f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1483 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1484 pandn m0, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1485 paddusb m6, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1486 psubusb m6, m3 ; p0+f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1487 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1488 ; store |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1489 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1490 mova [r0], m4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1491 mova [r0+r1], m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1492 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1493 mova m0, [rsp] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1494 SWAP 2, 4 ; p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1495 SWAP 1, 6 ; q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1496 mova m3, [rsp+mmsize] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1497 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1498 TRANSPOSE4x4B 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1499 %if mmsize == 16 ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1500 add r3, r1 ; change from r4+8*stride to r0+8*stride |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1501 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1502 %else ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1503 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1504 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1505 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1506 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1507 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1508 ; next 8 pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1509 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1510 add r0, 8 ; advance 8 cols = pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1511 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1512 lea r0, [r0+r2*8] ; advance 8 rows = lines |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1513 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1514 dec r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1515 jg .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1516 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1517 REP_RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1518 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1519 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1520 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1521 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1522 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1523 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1524 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1525 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1526 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1527 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1528 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1529 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1530 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1531 %define SPLATB_REG SPLATB_REG_MMX |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1532 SIMPLE_LOOPFILTER mmx, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1533 SIMPLE_LOOPFILTER mmx, h, 6 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1534 %define SPLATB_REG SPLATB_REG_MMXEXT |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1535 SIMPLE_LOOPFILTER mmxext, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1536 SIMPLE_LOOPFILTER mmxext, h, 6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1537 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1538 %define SPLATB_REG SPLATB_REG_SSE2 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1539 SIMPLE_LOOPFILTER sse2, v, 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1540 SIMPLE_LOOPFILTER sse2, h, 6 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1541 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 1542 SIMPLE_LOOPFILTER ssse3, v, 3 |
1543 SIMPLE_LOOPFILTER ssse3, h, 6 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1544 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1545 ;----------------------------------------------------------------------------- |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1546 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1547 ; int flimE, int flimI, int hev_thr); |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1548 ;----------------------------------------------------------------------------- |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1549 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1550 %macro INNER_LOOPFILTER 5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1551 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1552 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1553 %define dst8_reg r1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1554 %define mstride_reg r2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1555 %define E_reg r3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1556 %define I_reg r4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1557 %define hev_thr_reg r5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1558 %else ; luma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1559 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1560 %define mstride_reg r1 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1561 %define E_reg r2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1562 %define I_reg r3 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1563 %define hev_thr_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1564 %ifdef m8 ; x86-64, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1565 %define dst8_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1566 %elif mmsize == 16 ; x86-32, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1567 %define dst8_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1568 %else ; x86-32, mmx/mmxext |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1569 %define cnt_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1570 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1571 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1572 %define dst_reg r0 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1573 %define stride_reg E_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1574 %define dst2_reg I_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1575 %ifndef m8 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1576 %define stack_reg hev_thr_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1577 %endif |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1578 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1579 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1580 %if mmsize == 16 |
12210 | 1581 pxor m7, m7 |
1582 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1583 %endif |
12210 | 1584 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1585 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1586 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1587 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1588 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1589 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1590 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1591 ; align stack |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1592 mov stack_reg, rsp ; backup stack pointer |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1593 and rsp, ~(mmsize-1) ; align stack |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1594 %ifidn %2, v |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1595 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1596 ; [3]=hev() result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1597 %else ; h |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1598 sub rsp, mmsize * 5 ; extra storage space for transposes |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1599 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1600 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1601 %define flim_E [rsp] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1602 %define flim_I [rsp+mmsize] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1603 %define hev_thr [rsp+mmsize*2] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1604 %define mask_res [rsp+mmsize*3] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1605 %define p0backup [rsp+mmsize*3] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1606 %define q0backup [rsp+mmsize*4] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1607 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1608 mova flim_E, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1609 mova flim_I, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1610 mova hev_thr, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1611 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1612 %else ; sse2 on x86-64 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1613 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1614 %define flim_E m9 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1615 %define flim_I m10 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1616 %define hev_thr m11 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1617 %define mask_res m12 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1618 %define p0backup m12 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1619 %define q0backup m8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1620 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1621 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1622 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1623 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1624 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1625 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1626 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1627 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1628 mov cnt_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1629 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1630 mov stride_reg, mstride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1631 neg mstride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1632 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1633 lea dst_reg, [dst_reg + stride_reg*4-4] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1634 %if %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1635 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1636 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1637 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1638 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1639 %if mmsize == 8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1640 .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1641 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1642 ; read |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1643 lea dst2_reg, [dst_reg + stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1644 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1645 %if %4 == 8 && mmsize == 16 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1646 %define movrow movh |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1647 %else |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1648 %define movrow mova |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1649 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1650 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1651 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1652 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1653 movrow m5, [dst2_reg] ; q1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1654 movrow m6, [dst2_reg+ stride_reg] ; q2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1655 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1656 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1657 movhps m0, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1658 movhps m2, [dst8_reg+mstride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1659 add dst8_reg, stride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1660 movhps m1, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1661 movhps m5, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1662 movhps m6, [dst8_reg+ stride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1663 movhps m7, [dst8_reg+ stride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1664 add dst8_reg, mstride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1665 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1666 %elif mmsize == 8 ; mmx/mmxext (h) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1667 ; read 8 rows of 8px each |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1668 movu m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1669 movu m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1670 movu m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1671 movu m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1672 movu m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1673 movu m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1674 movu m6, [dst2_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1675 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1676 ; 8x8 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1677 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1678 mova q0backup, m1 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1679 movu m7, [dst2_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1680 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1681 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1682 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1683 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1684 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1685 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1687 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1688 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1689 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1690 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1691 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1692 %else ; sse2 (h) |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1693 %if %4 == 16 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1694 lea dst8_reg, [dst_reg + stride_reg*8] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1695 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1697 ; read 16 rows of 8px each, interleave |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1698 movh m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1699 movh m1, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1700 movh m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1701 movh m5, [dst8_reg+mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1702 movh m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1703 movh m6, [dst8_reg+mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1704 movh m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1705 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1706 punpcklbw m0, m1 ; A/I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1707 punpcklbw m2, m5 ; C/K |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1708 punpcklbw m3, m6 ; D/L |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1709 punpcklbw m4, m7 ; E/M |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1710 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1711 add dst8_reg, stride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1712 movh m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1713 movh m6, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1714 movh m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1715 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1716 punpcklbw m1, m6 ; B/J |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1717 punpcklbw m5, m7 ; F/N |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1718 movh m6, [dst2_reg+ stride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1719 movh m7, [dst8_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1720 punpcklbw m6, m7 ; G/O |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1721 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1722 ; 8x16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1723 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1724 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1725 SWAP 1, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1727 mova q0backup, m1 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1728 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1729 movh m7, [dst2_reg+ stride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1730 movh m1, [dst8_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1731 punpcklbw m7, m1 ; H/P |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1732 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1733 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1734 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1735 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1736 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1737 SWAP 1, 8 |
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1738 SWAP 2, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1739 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1740 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1741 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1742 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1743 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1744 %ifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1745 SWAP 5, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1746 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1747 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1748 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1749 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1750 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1751 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1753 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1754 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1755 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1756 mova m4, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1757 SWAP 4, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1758 psubusb m4, m0 ; p2-p3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1759 psubusb m0, m1 ; p3-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1760 por m0, m4 ; abs(p3-p2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1761 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1762 mova m4, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 SWAP 4, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1764 psubusb m4, m1 ; p1-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1765 psubusb m1, m2 ; p2-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 por m1, m4 ; abs(p2-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 mova m4, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 SWAP 4, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1770 psubusb m4, m7 ; q2-q3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1771 psubusb m7, m6 ; q3-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 por m7, m4 ; abs(q3-q2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1773 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 mova m4, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 SWAP 4, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 psubusb m4, m6 ; q1-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 psubusb m6, m5 ; q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 por m6, m4 ; abs(q2-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1779 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1780 %ifidn %1, mmx |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1781 mova m4, flim_I |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 pxor m3, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 psubusb m0, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1784 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1785 psubusb m7, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1786 psubusb m6, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1787 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1788 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1789 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1790 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1791 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1792 pand m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1793 pand m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1794 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1796 pmaxub m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1797 pmaxub m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1798 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1799 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1800 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1801 SWAP 7, 3 ; now m7 is zero (mmx path only, via the pxor m3, m3 above) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1803 movrow m3, [dst_reg +mstride_reg] ; p0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1804 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1805 movhps m3, [dst8_reg+mstride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1806 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1807 %elifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1808 SWAP 3, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1809 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1810 mova m3, p0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1811 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1812 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1813 mova m1, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1814 SWAP 1, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1815 mova m6, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1816 SWAP 3, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1817 psubusb m1, m3 ; p1-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1818 psubusb m6, m2 ; p0-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 por m1, m6 ; abs(p1-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1820 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 mova m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1822 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1823 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1824 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1826 pand m0, m1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1827 mova mask_res, m6 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 pmaxub m0, m1 ; max_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1830 SWAP 1, 4 ; max_hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1831 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1833 SWAP 6, 4 ; now m6 is I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1835 movrow m4, [dst_reg] ; q0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1836 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1837 movhps m4, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1838 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1839 %elifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1840 SWAP 4, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1841 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1842 mova m4, q0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1843 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1844 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1845 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1846 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1847 SWAP 7, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1848 psubusb m1, m5 ; q0-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1849 psubusb m7, m4 ; q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1850 por m1, m7 ; abs(q1-q0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1852 mova m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 psubusb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 psubusb m7, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1858 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 pand m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1864 pmaxub m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 psubusb m0, flim_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1867 pcmpeqb m0, m7 ; max(abs(..)) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1868 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1873 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 ; simple_limit |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 mova m1, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 SWAP 1, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1879 mova m6, m4 ; keep copies of p0/q0 around for later use |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 SWAP 6, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 psubusb m1, m4 ; p0-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 psubusb m6, m3 ; q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1883 por m1, m6 ; abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 paddusb m1, m1 ; m1=2*abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 mova m7, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 SWAP 7, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 mova m6, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1889 SWAP 6, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1890 psubusb m7, m5 ; p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1891 psubusb m6, m2 ; q1-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1892 por m7, m6 ; abs(q1-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1893 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1894 pand m7, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 psrlq m7, 1 ; abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1896 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 psubusb m7, flim_E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 pand m0, m7 ; normal_limit result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 %ifdef m8 ; x86-64 && sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 mova m8, [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 %define pb_80_var m8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 %else ; x86-32 or mmx/mmxext |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 %define pb_80_var [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 mova m7, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 pxor m1, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1911 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 psubsb m1, m7 ; (signed) q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1913 mova m6, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 pxor m6, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 psubsb m6, m7 ; (signed) p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 mova m7, mask_res |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 pandn m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1921 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1922 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1923 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1924 pand m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1925 mova m1, [pb_F8] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1926 mova m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 paddsb m7, [pb_3] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1928 paddsb m6, [pb_4] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 pand m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 pand m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 pcmpgtb m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 psubb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 psrlq m7, 3 ; +f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 psrlq m0, 3 ; -f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1939 pandn m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1940 psubusb m3, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1941 paddusb m3, m1 ; p0+f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1942 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1943 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1944 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1945 pcmpgtb m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1946 psubb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1947 psrlq m6, 3 ; +f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1948 psrlq m1, 3 ; -f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 pand m1, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1950 pandn m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1951 psubusb m4, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 paddusb m4, m1 ; q0-f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1957 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1959 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1960 mova m7, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1963 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 pand m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1965 pand m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1966 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1967 paddusb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1968 pand m1, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1969 pandn m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1970 psrlq m1, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1971 psrlq m7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1972 SWAP 0, 7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1974 psubusb m1, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 pavgb m0, m7 ; a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1976 pavgb m1, m7 ; -a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1978 psubusb m5, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 psubusb m2, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 paddusb m5, m1 ; q1-a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1981 paddusb m2, m0 ; p1+a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 ; store |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1984 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1985 movrow [dst_reg +mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1986 movrow [dst_reg +mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1987 movrow [dst_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1988 movrow [dst_reg + stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1989 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1990 movhps [dst8_reg+mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1991 movhps [dst8_reg+mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1992 movhps [dst8_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1993 movhps [dst8_reg+ stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1994 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1995 %else ; h |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1996 add dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1997 add dst2_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1998 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1999 ; 4x8/16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2000 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2001 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2002 %if mmsize == 8 ; mmx/mmxext (h) |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2003 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2004 %else ; sse2 (h) |
12180 | 2005 lea dst8_reg, [dst8_reg+mstride_reg+2] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2006 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2007 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2008 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2009 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2010 %if mmsize == 8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2011 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2012 %ifidn %2, h |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2013 sub dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2014 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2015 cmp dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2016 mov dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2017 jnz .next8px |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2018 %else |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2019 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2020 lea dst_reg, [dst_reg + stride_reg*8-2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2021 %else ; v |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2022 add dst_reg, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2023 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2024 dec cnt_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2025 jg .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2026 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2027 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2028 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2029 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2030 mov rsp, stack_reg ; restore stack pointer |
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
2031 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2032 RET |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2033 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2034 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2035 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2036 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2037 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
2038 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2039 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2040 INNER_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2041 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2042 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2043 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
2044 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
2045 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
2046 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2047 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2048 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2049 %define SPLATB_REG SPLATB_REG_SSE2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2050 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2051 %ifdef m8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2052 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2053 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2054 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2055 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2056 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2057 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2058 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2059 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 2060 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2061 %ifdef m8 | |
2062 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
2063 %else | |
2064 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
2065 %endif | |
2066 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
2067 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
2068 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2069 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2070 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2071 ; int flimE, int flimI, int hev_thr); |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2072 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2073 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2074 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2075 ; 1 and 2 are the registers to write; these can be the same (for SSE2) |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2076 ; for pre-SSE4: |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2077 ; 3 is a general-purpose register that we will clobber |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2078 ; for SSE4: |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2079 ; 3 is a pointer to the destination's 5th line |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2080 ; 4 is a pointer to the destination's 4th line |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2081 ; 5 and 6 are -stride and +stride, respectively |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
; WRITE_2x4W: store four 16-bit words from each of two registers, one word per
; row, using eight 16-bit stores in total.
;   %1, %2  registers holding the words. Both are modified: punpckhdq copies
;           the high dword down so that a second movd can reach words 2 and 3.
;   %3      general-purpose scratch register: loaded with movd, its low word
;           (%3w) is stored, then it is shifted right by 16 for the next word.
;   %4      destination base pointer. It is stepped by %6, %5, %6 and %5 in
;           turn, so it returns to its entry value only when %5 == -%6.
;   %5, %6  row offsets used as index registers.
; With %5 == -%6, and relative to the entry value of %4, the words of %1 go to
; rows %5*4, %5*3, %5*2, %5*1 and the words of %2 go to rows 0, %6*1, %6*2,
; %6*3, i.e. eight consecutive rows.
2082 %macro WRITE_2x4W 6 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2083 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2084 punpckhdq %1, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2085 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2086 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2087 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2088 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2089 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2090 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2091 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2092 mov [%4+%5*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2093 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2094 mov [%4+%5 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2095 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2096 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2097 punpckhdq %2, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2098 mov [%4 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2099 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2100 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2101 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2102 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2103 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2104 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2105 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2106 mov [%4+%6*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2107 add %4, %5 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2108 %endmacro |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2109 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
; WRITE_8W_SSE2: store the eight 16-bit words of one register, one word per
; row, two words per movd.
;   %1      register holding the words. It is consumed: psrldq shifts it right
;           by 4 bytes after each of the first three movd extractions.
;   %2      general-purpose scratch register: loaded with movd, its low word
;           (%2w) is stored, then it is shifted right by 16 for the next word.
;   %3      destination base pointer. It is stepped by %5, %4 and %5, and is
;           NOT restored: on exit it holds its entry value + %5*2 + %4.
;   %4, %5  row offsets used as index registers.
; With %4 == -%5, and relative to the entry value of %3, words 0..7 go to rows
; %4*4, %4*3, %4*2, %4*1, 0, %5*1, %5*2, %5*3, i.e. eight consecutive rows.
; NOTE(review): the parameter comment that precedes WRITE_2x4W numbers its
; arguments for a 6-parameter macro; it does not match this 5-parameter one.
2110 %macro WRITE_8W_SSE2 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2111 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2112 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2113 mov [%3+%4*4], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2114 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2115 add %3, %5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2116 mov [%3+%4*4], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2117 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2118 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2119 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2120 add %3, %4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2121 mov [%3+%4*2], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2122 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2123 mov [%3+%4 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2124 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2125 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2126 psrldq %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2127 mov [%3 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2128 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2129 mov [%3+%5 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2130 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2131 movd %2, %1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2132 add %3, %5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2133 mov [%3+%5 ], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2134 shr %2, 16 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2135 mov [%3+%5*2], %2w |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2136 %endmacro |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2137 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
; WRITE_8W_SSE4: store the eight 16-bit words of %1 with pextrw writing
; straight to memory; no register is modified by this macro.
;   %1      register holding the words (only read).
;   %2, %3  two destination base pointers: words 0, 2, 3 and 4 are addressed
;           from %3, words 1, 5, 6 and 7 from %2.
;   %4, %5  row offsets used as index registers (%4 for words 0-3, %5 for
;           words 6-7).
; With %4 == -%5 and %2 == %3 + %5, words 0..7 go to the eight consecutive
; rows %3 + %4*4 .. %3 + %5*3.
; NOTE(review): the parameter comment that precedes WRITE_2x4W numbers its
; arguments for a 6-parameter macro; it does not match this 5-parameter one.
2138 %macro WRITE_8W_SSE4 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2139 pextrw [%3+%4*4], %1, 0 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2140 pextrw [%2+%4*4], %1, 1 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2141 pextrw [%3+%4*2], %1, 2 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2142 pextrw [%3+%4 ], %1, 3 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2143 pextrw [%3 ], %1, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2144 pextrw [%2 ], %1, 5 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2145 pextrw [%2+%5 ], %1, 6 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2146 pextrw [%2+%5*2], %1, 7 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2147 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2148 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2149 %macro MBEDGE_LOOPFILTER 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2150 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2151 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2152 %define dst8_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2153 %define mstride_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2154 %define E_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2155 %define I_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2156 %define hev_thr_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 %else ; luma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 %define mstride_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 %define E_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 %define I_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2162 %define hev_thr_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2163 %ifdef m8 ; x86-64, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2164 %define dst8_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %elif mmsize == 16 ; x86-32, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 %define dst8_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %else ; x86-32, mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 %define cnt_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2170 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2171 %define dst_reg r0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2172 %define stride_reg E_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2173 %define dst2_reg I_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %ifndef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 %define stack_reg hev_thr_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2176 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2177 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2178 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2179 %if mmsize == 16 |
12210 | 2180 pxor m7, m7 |
2181 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2182 %endif |
12210 | 2183 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2186 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2187 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2188 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2189 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2190 ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2191 mov stack_reg, rsp ; backup stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2192 and rsp, ~(mmsize-1) ; align stack |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2193 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2194 sub rsp, mmsize * 7 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2195 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2196 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2197 ; [3]=hev() result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2198 ; [4]=filter tmp result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2199 ; [5]/[6] = p2/q2 backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 ; [7]=lim_res sign result |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2201 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2202 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2203 %define flim_E [rsp] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2204 %define flim_I [rsp+mmsize] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2205 %define hev_thr [rsp+mmsize*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 %define mask_res [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 %define lim_res [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 %define p0backup [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2209 %define q0backup [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2210 %define p2backup [rsp+mmsize*5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2211 %define q2backup [rsp+mmsize*6] |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2212 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2213 %define lim_sign [rsp] |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2214 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 %define lim_sign [rsp+mmsize*7] |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2216 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2217 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2218 mova flim_E, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2219 mova flim_I, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2220 mova hev_thr, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2222 %else ; sse2 on x86-64 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2223 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2224 %define flim_E m9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 %define flim_I m10 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2226 %define hev_thr m11 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 %define mask_res m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2228 %define lim_res m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2229 %define p0backup m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2230 %define q0backup m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 %define p2backup m13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2232 %define q2backup m14 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2233 %define lim_sign m9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2236 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2237 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2238 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 mov cnt_reg, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2243 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 mov stride_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 neg mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2246 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2247 lea dst_reg, [dst_reg + stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2248 %if %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2249 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2252 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2253 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2254 .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 ; read |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 lea dst2_reg, [dst_reg + stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 %if %4 == 8 && mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 %define movrow movh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 %define movrow mova |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 movrow m5, [dst2_reg] ; q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 movrow m6, [dst2_reg+ stride_reg] ; q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 movhps m0, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 movhps m2, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 movhps m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 movhps m5, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 movhps m6, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 movhps m7, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 %elif mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 ; read 8 rows of 8px each |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 movu m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 movu m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 movu m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 movu m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 movu m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 movu m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 movu m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 ; 8x8 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 movu m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 %if %4 == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 lea dst8_reg, [dst_reg + stride_reg*8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 ; read 16 rows of 8px each, interleave |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 movh m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 movh m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 movh m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 movh m5, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 movh m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 movh m6, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 movh m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 punpcklbw m0, m1 ; A/I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 punpcklbw m2, m5 ; C/K |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 punpcklbw m3, m6 ; D/L |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 punpcklbw m4, m7 ; E/M |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 movh m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 movh m6, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 movh m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 punpcklbw m1, m6 ; B/J |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 punpcklbw m5, m7 ; F/N |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 movh m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 movh m7, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 punpcklbw m6, m7 ; G/O |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 ; 8x16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 movh m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 movh m1, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 punpcklbw m7, m1 ; H/P |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 SWAP 2, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 SWAP 5, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 mova m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 SWAP 4, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 psubusb m4, m0 ; p2-p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 psubusb m0, m1 ; p3-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 por m0, m4 ; abs(p3-p2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 mova m4, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 SWAP 4, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 psubusb m4, m1 ; p1-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 mova p2backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 psubusb m1, m2 ; p2-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 por m1, m4 ; abs(p2-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 mova m4, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 SWAP 4, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 psubusb m4, m7 ; q2-q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 psubusb m7, m6 ; q3-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 por m7, m4 ; abs(q3-q2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 mova m4, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 SWAP 4, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 psubusb m4, m6 ; q1-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 mova q2backup, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 psubusb m6, m5 ; q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 por m6, m4 ; abs(q2-q1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 mova m4, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 pxor m3, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 psubusb m0, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 psubusb m7, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 psubusb m6, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 pand m7, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 pmaxub m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 pmaxub m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 SWAP 7, 3 ; now m7 is zero |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 movrow m3, [dst_reg +mstride_reg] ; p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 movhps m3, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 %elifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 SWAP 3, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 mova m3, p0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 mova m1, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 SWAP 1, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 mova m6, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 SWAP 3, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 psubusb m1, m3 ; p1-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 psubusb m6, m2 ; p0-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 por m1, m6 ; abs(p1-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 mova mask_res, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 pmaxub m0, m1 ; max_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 SWAP 1, 4 ; max_hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 SWAP 6, 4 ; now m6 is I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 movrow m4, [dst_reg] ; q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 movhps m4, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 %elifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 SWAP 4, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 mova m4, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 SWAP 7, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 psubusb m1, m5 ; q0-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 psubusb m7, m4 ; q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 por m1, m7 ; abs(q1-q0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 mova m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 psubusb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 psubusb m7, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 pand m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 pxor m7, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 pmaxub m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 psubusb m0, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 pcmpeqb m0, m7 ; max(abs(..)) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 ; simple_limit |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 mova m1, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 SWAP 1, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 mova m6, m4 ; keep copies of p0/q0 around for later use |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 SWAP 6, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 psubusb m1, m4 ; p0-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 psubusb m6, m3 ; q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 por m1, m6 ; abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 paddusb m1, m1 ; m1=2*abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 mova m7, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 SWAP 7, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 mova m6, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 SWAP 6, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 psubusb m7, m5 ; p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 psubusb m6, m2 ; q1-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 por m7, m6 ; abs(q1-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 pand m7, [pb_FE] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 psrlq m7, 1 ; abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 psubusb m7, flim_E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 pand m0, m7 ; normal_limit result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 %ifdef m8 ; x86-64 && sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 mova m8, [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 %define pb_80_var m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 %else ; x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 %define pb_80_var [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 mova m7, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 pxor m1, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 psubsb m1, m7 ; (signed) q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 mova m6, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 pxor m6, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 psubsb m6, m7 ; (signed) p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 mova m7, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 pand m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 pand lim_res, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 mova m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 mova lim_res, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 mova m1, [pb_F8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 mova m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 paddsb m7, [pb_3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 paddsb m6, [pb_4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 pand m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 pand m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 pcmpgtb m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 psubb m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 psrlq m7, 3 ; +f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 psrlq m0, 3 ; -f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 pandn m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 psubusb m3, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 paddusb m3, m1 ; p0+f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 pcmpgtb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 psrlq m6, 3 ; +f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 psrlq m1, 3 ; -f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 pand m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 pandn m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 psubusb m4, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 paddusb m4, m1 ; q0-f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 mova m7, [pw_63] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 pcmpgtb m0, m1 ; which are negative |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 punpcklbw m6, m0 ; signed byte->word |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 punpckhbw m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 mova lim_sign, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 mova mask_res, m6 ; backup for later in filter |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 mova lim_res, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 pmullw m6, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 pmullw m1, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2595 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2596 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2597 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2599 packsswb m6, m1 ; a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 pand m1, m0 ; -a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 pandn m0, m6 ; +a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 psubusb m3, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 paddusb m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 paddusb m3, m0 ; p0+a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 psubusb m4, m0 ; q0-a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2608 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2609 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2610 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2611 mova m0, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 pmullw m6, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 pmullw m1, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2614 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2616 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2617 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2618 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2619 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2620 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2621 pand m1, m0 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2622 pandn m0, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2623 psubusb m2, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2624 paddusb m5, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2625 paddusb m2, m0 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2626 psubusb m5, m0 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2627 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2628 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2629 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2630 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2631 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2632 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2633 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2634 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 pmullw m6, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 pmullw m1, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 paddw m1, m7 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2639 %ifdef m9 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2640 SWAP 7, 9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2641 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 mova m7, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 packsswb m6, m1 ; a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 psubb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2649 pand m0, m7 ; -a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2650 pandn m7, m6 ; +a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2651 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 SWAP 1, 13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 SWAP 6, 14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 mova m1, p2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 mova m6, q2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2657 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2658 psubusb m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2659 paddusb m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2660 paddusb m1, m7 ; p2+a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2661 psubusb m6, m7 ; q2-a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2662 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2663 ; store |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2664 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2665 movrow [dst2_reg+mstride_reg*4], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2666 movrow [dst_reg +mstride_reg*2], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2667 movrow [dst_reg +mstride_reg ], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2668 movrow [dst_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2669 movrow [dst2_reg], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2670 movrow [dst2_reg+ stride_reg ], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2671 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2672 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2673 movhps [dst8_reg+mstride_reg*2], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2674 movhps [dst8_reg+mstride_reg ], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 movhps [dst8_reg], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2677 movhps [dst8_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2678 movhps [dst8_reg+ stride_reg ], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2679 movhps [dst8_reg+ stride_reg*2], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2680 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2681 %else ; h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2682 inc dst_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2683 inc dst2_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2684 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2685 ; 4x8/16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2686 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2687 SBUTTERFLY bw, 5, 6, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2688 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2689 %if mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2690 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2691 add dst_reg, 4 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2692 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2693 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2694 lea dst8_reg, [dst8_reg+mstride_reg+1] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2695 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2696 lea dst_reg, [dst2_reg+mstride_reg+4] |
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2697 lea dst8_reg, [dst8_reg+mstride_reg+4] |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2698 %ifidn %1, sse4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2699 add dst2_reg, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2700 %endif |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2701 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2702 %ifidn %1, sse4 |
12268
259988e7ad0f
Fix obvious bug in assignment. Somehow, the test vectors don't test this...
rbultje
parents:
12266
diff
changeset
|
2703 lea dst2_reg, [dst8_reg+ stride_reg] |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2704 %endif |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2705 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2706 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2707 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2708 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2709 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2710 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2711 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2712 sub dst_reg, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2713 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2714 cmp dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2715 mov dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2716 jnz .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2717 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2718 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2719 lea dst_reg, [dst_reg + stride_reg*8-5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2720 %else ; v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2721 add dst_reg, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2722 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2723 dec cnt_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2724 jg .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2725 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2726 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2727 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2728 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2729 mov rsp, stack_reg ; restore stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2730 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2731 RET |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2732 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2733 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Instantiate the MMX variants of MBEDGE_LOOPFILTER.
; Arguments, as far as the visible tail of the macro uses them:
;   %1 = instruction-set name (the macro compares it against "sse4")
;   %2 = v or h variant (the macro branches on "%ifidn %2, h" / "%else ; v")
;   %4 = block size; the macro labels %4 == 8 as chroma (16 is presumably luma)
; NOTE(review): %3 and %5 are consumed earlier in the macro, outside this
; excerpt -- presumably register counts; confirm against the macro header.
; SPLATB_REG is bound to its MMX implementation before the macro is expanded.
2734 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2735 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2736 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2737 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2738 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2739 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2740 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; MMXEXT variants: rebind SPLATB_REG to its MMXEXT implementation, then expand
; MBEDGE_LOOPFILTER for v and h with sizes 16 and 8 (the macro labels size 8
; as chroma). No INIT_* precedes these lines, so the register mode selected by
; the preceding INIT_MMX is still in effect.
2741 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2742 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
2743 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
2744 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
2745 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2746 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; SSE2 variants: switch to XMM mode and bind SPLATB_REG and WRITE_8W to their
; SSE2 implementations before expanding MBEDGE_LOOPFILTER.
; The h/16 variant passes 5 as the third argument when m8 is defined and 6
; otherwise; the macro's own comment describes the m8-undefined case as
; "sse2 on x86-32 or mmx/mmxext".
; NOTE(review): the third argument (5 or 6) and fifth argument (15) are
; presumably GPR and XMM register counts -- confirm against the macro header,
; which is outside this excerpt.
2747 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2748 %define SPLATB_REG SPLATB_REG_SSE2 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2749 %define WRITE_8W WRITE_8W_SSE2 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2750 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2751 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2752 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2753 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2754 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2755 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2756 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2757 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 |
12210 | 2758 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SSSE3 variants: only SPLATB_REG is rebound here. WRITE_8W is not redefined
; between the SSE2 instantiations and this point, so the WRITE_8W_SSE2 binding
; still applies. The argument sets mirror the SSE2 instantiations, including
; the m8-dependent third argument (5 vs 6) for the h/16 variant.
2759 %define SPLATB_REG SPLATB_REG_SSSE3 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2760 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 |
12210 | 2761 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2762 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 |
12210 | 2763 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2764 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 |
12210 | 2765 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2766 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2767 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2768 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
; SSE4 variants: WRITE_8W is rebound to WRITE_8W_SSE4; SPLATB_REG is not
; redefined, so the SSSE3 binding set above remains in effect.
; Only h variants are instantiated in the lines visible here. The macro
; special-cases "%ifidn %1, sse4" around its WRITE_8W calls to adjust dst2_reg.
; NOTE(review): presumably no sse4 v variant exists because WRITE_8W is used
; only on the h store path -- confirm against the full macro body.
2769 %define WRITE_8W WRITE_8W_SSE4 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2770 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2771 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2772 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2773 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2774 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2775 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 |