Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12240:e6ade5e849c9 libavcodec
VP8: Clean up some variable shadowing.
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 03:44:37 +0000 |
parents | 13b1ad24a4b1 |
children | c7f6ddcc5c01 |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
148 cextern pb_1 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
149 cextern pw_3 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
150 cextern pb_3 |
11975 | 151 cextern pw_4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
152 cextern pb_4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
153 cextern pw_9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
154 cextern pw_18 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
155 cextern pw_27 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
156 cextern pw_63 |
11975 | 157 cextern pw_64 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
158 cextern pb_80 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
159 cextern pb_F8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
160 cextern pb_FE |
11975 | 161 |
162 SECTION .text | |
163 | |
164 ;----------------------------------------------------------------------------- | |
165 ; subpel MC functions: | |
166 ; | |
167 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
168 ; uint8_t *src, int srcstride, | |
169 ; int height, int mx, int my); | |
170 ;----------------------------------------------------------------------------- | |
171 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
172 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
173 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
174 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
175 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
331 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 |
11975 | 337 ; 4x4 block, H-only 4-tap filter |
338 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
339 shl r5d, 4 | |
340 %ifdef PIC | |
341 lea r11, [fourtap_filter_hw_m] | |
342 %endif | |
343 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
344 movq mm5, [fourtap_filter_hw+r5] | |
345 movq mm7, [pw_64] | |
346 pxor mm6, mm6 | |
347 | |
348 .nextrow | |
349 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
350 | |
351 ; first set of 2 pixels | |
352 movq mm2, mm1 ; byte ABCD.. | |
353 punpcklbw mm1, mm6 ; byte->word ABCD | |
354 pshufw mm0, mm2, 9 ; byte CDEF.. | |
355 punpcklbw mm0, mm6 ; byte->word CDEF | |
356 pshufw mm3, mm1, 0x94 ; word ABBC | |
357 pshufw mm1, mm0, 0x94 ; word CDDE | |
358 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
359 movq mm0, mm1 ; backup for second set of pixels | |
360 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
361 paddd mm3, mm1 ; finish 1st 2px | |
362 | |
363 ; second set of 2 pixels, use backup of above | |
364 punpckhbw mm2, mm6 ; byte->word EFGH | |
365 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
366 pshufw mm1, mm2, 0x94 ; word EFFG | |
367 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
368 paddd mm0, mm1 ; finish 2nd 2px | |
369 | |
370 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
371 packssdw mm3, mm0 ; merge dword->word (4px) | |
372 paddsw mm3, mm7 ; rounding | |
373 psraw mm3, 7 | |
374 packuswb mm3, mm6 ; clip and word->bytes | |
375 movd [r0], mm3 ; store | |
376 | |
377 ; go to next line | |
378 add r0, r1 | |
379 add r2, r3 | |
380 dec r4 ; next row | |
381 jg .nextrow | |
382 REP_RET | |
383 | |
384 ; 4x4 block, H-only 6-tap filter | |
385 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
386 lea r5d, [r5*3] | |
387 %ifdef PIC | |
388 lea r11, [sixtap_filter_hw_m] | |
389 %endif | |
390 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
391 movq mm5, [sixtap_filter_hw+r5*8-32] | |
392 movq mm6, [sixtap_filter_hw+r5*8-16] | |
393 movq mm7, [pw_64] | |
394 pxor mm3, mm3 | |
395 | |
396 .nextrow | |
397 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
398 | |
399 ; first set of 2 pixels | |
400 movq mm2, mm1 ; byte ABCD.. | |
401 punpcklbw mm1, mm3 ; byte->word ABCD | |
402 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
403 punpckhbw mm2, mm3 ; byte->word EFGH | |
404 punpcklbw mm0, mm3 ; byte->word CDEF | |
405 pshufw mm1, mm1, 0x94 ; word ABBC | |
406 pshufw mm2, mm2, 0x94 ; word EFFG | |
407 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
408 pshufw mm3, mm0, 0x94 ; word CDDE | |
409 movq mm0, mm3 ; backup for second set of pixels | |
410 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
411 paddd mm1, mm3 ; add to 1st 2px cache | |
412 movq mm3, mm2 ; backup for second set of pixels | |
413 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
414 paddd mm1, mm2 ; finish 1st 2px | |
415 | |
416 ; second set of 2 pixels, use backup of above | |
417 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
418 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
419 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
420 paddd mm0, mm3 ; add to 2nd 2px cache | |
421 pxor mm3, mm3 | |
422 punpcklbw mm2, mm3 ; byte->word FGHI | |
423 pshufw mm2, mm2, 0xE9 ; word GHHI | |
424 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
425 paddd mm0, mm2 ; finish 2nd 2px | |
426 | |
427 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
428 packssdw mm1, mm0 ; merge dword->word (4px) | |
429 paddsw mm1, mm7 ; rounding | |
430 psraw mm1, 7 | |
431 packuswb mm1, mm3 ; clip and word->bytes | |
432 movd [r0], mm1 ; store | |
433 | |
434 ; go to next line | |
435 add r0, r1 | |
436 add r2, r3 | |
437 dec r4 ; next row | |
438 jg .nextrow | |
439 REP_RET | |
440 | |
;-----------------------------------------------------------------------------
; put_vp8_epel8_h4_sse2: 8-pixel-wide rows, horizontal-only 4-tap filter
;
; Register roles, as used by the code below:
;   r0 = dst pointer, advanced by r1 after every row
;   r2 = src pointer, advanced by r3 after every row
;   r4 = number of rows still to produce (treated as a 32-bit value)
;   r5 = filter index; index*16 selects two consecutive 16-byte coefficient
;        rows (at -16 and +0) inside fourtap_filter_hw
;
; The row is processed as two halves of four pixels; each half gets pw_64
; added, is shifted right by 7 and is packed to bytes with unsigned
; saturation.
;-----------------------------------------------------------------------------
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    mova      m5, [fourtap_filter_hw+r5-16]   ; set up 4tap filter in words
    mova      m6, [fourtap_filter_hw+r5]
    pxor      m7, m7                          ; zero, used for unpacking/packing

.nextrow:
    ; pixels 0-3: 8 source bytes starting one pixel left of the output
    movh      m0, [r2-1]
    punpcklbw m0, m7                          ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                           ; BCDEFGH
    psrldq    m2, 4                           ; CDEFGH
    psrldq    m3, 6                           ; DEFGH
    punpcklwd m0, m1                          ; ABBCCDDE
    punpcklwd m2, m3                          ; CDDEEFFG
    pmaddwd   m0, m5
    pmaddwd   m2, m6
    paddd     m0, m2

    ; pixels 4-7: same scheme on the 8 bytes starting 4 pixels further right
    ; (EFGHIJKL in the lettering used above)
    movh      m1, [r2+3]
    punpcklbw m1, m7                          ; EFGHIJKL
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    psrldq    m2, 2                           ; FGHIJKL
    psrldq    m3, 4                           ; GHIJKL
    psrldq    m4, 6                           ; HIJKL
    punpcklwd m1, m2                          ; EFFGGHHI
    punpcklwd m3, m4                          ; GHHIIJJK
    pmaddwd   m1, m5
    pmaddwd   m3, m6
    paddd     m1, m3

    packssdw  m0, m1                          ; merge dword->word (8px)
    paddsw    m0, [pw_64]                     ; rounding
    psraw     m0, 7
    packuswb  m0, m7                          ; clip and word->bytes
    movh      [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; Only the low 32 bits of the counter are tested: the upper half of a
    ; 64-bit register carrying a 32-bit argument is not guaranteed to be
    ; clean, and the pixel-copy functions in this file also use r4d.
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
493 | |
;-----------------------------------------------------------------------------
; put_vp8_epel8_h6_sse2: 8-pixel-wide rows, horizontal-only 6-tap filter
;
; Register roles, as used by the code below:
;   r0 = dst pointer, advanced by r1 after every row
;   r2 = src pointer, advanced by r3 after every row
;   r4 = number of rows still to produce (treated as a 32-bit value)
;   r5 = filter index on entry; rewritten to point just past the selected
;        48-byte coefficient set, so the taps live at r5-48, r5-32 and r5-16
;
; Each row is loaded once (16 bytes starting 2 pixels left of the output) and
; filtered as two halves of four pixels.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]
    pxor      m7, m7                          ; zero, used for unpacking/packing

.nextrow:
    movu      m0, [r2-2]
    mova      m6, m0                          ; keep raw bytes for the second half
    mova      m4, m0

    ; pixels 0-3
    punpcklbw m0, m7                          ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                           ; BCDEFGH
    psrldq    m2, 4                           ; CDEFGH
    psrldq    m3, 6                           ; DEFGH
    psrldq    m4, 4                           ; drop 4 source bytes
    punpcklbw m4, m7                          ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2                           ; FGHIJKL
    punpcklwd m0, m1                          ; ABBCCDDE
    punpcklwd m2, m3                          ; CDDEEFFG
    punpcklwd m4, m5                          ; EFFGGHHI
    pmaddwd   m0, [r5-48]                     ; taps F0/F1
    pmaddwd   m2, [r5-32]                     ; taps F2/F3
    pmaddwd   m4, [r5-16]                     ; taps F4/F5
    paddd     m0, m2
    paddd     m0, m4

    ; pixels 4-7: identical scheme, window moved right by 4 source bytes
    psrldq    m6, 4
    mova      m4, m6
    punpcklbw m6, m7                          ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                           ; FGHIJKL
    psrldq    m2, 4                           ; GHIJKL
    psrldq    m3, 6                           ; HIJKL
    psrldq    m4, 4                           ; drop 4 more source bytes
    punpcklbw m4, m7                          ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2                           ; JKLMNOP
    punpcklwd m6, m1                          ; EFFGGHHI
    punpcklwd m2, m3                          ; GHHIIJJK
    punpcklwd m4, m5                          ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4

    packssdw  m0, m6                          ; merge dword->word (8px)
    paddsw    m0, [pw_64]                     ; rounding
    psraw     m0, 7
    packuswb  m0, m7                          ; clip and word->bytes
    movh      [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; Only the low 32 bits of the counter are tested: the upper half of a
    ; 64-bit register carrying a 32-bit argument is not guaranteed to be
    ; clean, and the pixel-copy functions in this file also use r4d.
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
560 | |
;-----------------------------------------------------------------------------
; FILTER_V opt, width, xmm_regs
;
; Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>:
; vertical-only 4-tap and 6-tap filters. The block width follows from the
; active register size (movh moves 4 bytes under INIT_MMX and 8 bytes under
; INIT_XMM), so the macro is instantiated once per instruction set below.
;
; Register roles inside both generated functions:
;   r0 = dst pointer, advanced by r1 after every row
;   r2 = src pointer, advanced by r3 after every row
;   r4 = number of rows still to produce (treated as a 32-bit value)
;   r6 = filter index on entry; rewritten to the address of the selected
;        coefficient rows (16 bytes per tap)
;
; Source rows are kept unpacked to words in registers and rotated by one row
; per iteration, so every iteration loads exactly one new row.
;-----------------------------------------------------------------------------
%macro FILTER_V 3
; vertical-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl       r6d, 5
%ifdef PIC
    lea       r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]                     ; rounding term
    pxor      m7, m7                          ; zero, used for unpacking/packing
    mova      m5, [r6+48]                     ; last tap, kept in a register

    ; read 3 lines (one above the current row, the row itself, one below)
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                   ; read new row
    punpcklbw m4, m7
    mova      m3, m4                          ; keep the new row for the next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps, rotating the row registers as we go
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh      [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; Test only the low 32 bits: the upper half of a 64-bit register carrying
    ; a 32-bit argument is not guaranteed to be clean, and the pixel-copy
    ; functions in this file also use r4d.
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET


; vertical-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl       r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea       r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7                          ; zero, used for unpacking/packing

    ; read 5 lines (two above the current row, the row itself, two below)
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps, rotating the row registers as we go
    movh      m5, [r2+2*r3]                   ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh      [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; Test only the low 32 bits, for the same reason as in the 4-tap variant.
    dec       r4d                             ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
684 | |
;-----------------------------------------------------------------------------
; FILTER_BILINEAR opt, width, xmm_regs
;
; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
; Both produce two output rows per loop iteration.
;
; Register roles inside the generated functions:
;   r0 = dst pointer, r1 = dst row step
;   r2 = src pointer, r3 = src row step
;   r4 = number of rows still to produce (treated as a 32-bit value)
;   r5 / r6 = the two fraction arguments; the _h variant indexes the weight
;             table with r5, the _v variant with r6. The other register is
;             overwritten with (8*16 - 16*fraction) to fetch the
;             complementary weight.
;
; Each pixel is w0*a + w1*b computed in words, shifted right by 2 and then
; averaged with zero (pavgw), which is a rounding shift by one more bit.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov       r5d, 8*16
    shl       r6d, 4
    sub       r5d, r6d
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                          ; zero, used for unpacking and pavgw
    mova      m4, [bilinear_filter_vw+r5-16]  ; weight for the upper row
    mova      m5, [bilinear_filter_vw+r6-16]  ; weight for the lower row
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                          ; middle row serves both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; Test only the low 32 bits: the upper half of a 64-bit register carrying
    ; a 32-bit argument is not guaranteed to be clean, and the pixel-copy
    ; functions in this file also use r4d.
    sub       r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov       r6d, 8*16
    shl       r5d, 4
    sub       r6d, r5d
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                          ; zero, used for unpacking and pavgw
    mova      m4, [bilinear_filter_vw+r6-16]  ; weight for the left pixel
    mova      m5, [bilinear_filter_vw+r5-16]  ; weight for the right pixel
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; Test only the low 32 bits, for the same reason as in the _v variant.
    sub       r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
782 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
;-----------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 width
;
; Emits put_vp8_bilinear<width>_v_ssse3 and put_vp8_bilinear<width>_h_ssse3.
; Neighbouring source bytes are paired up (punpcklbw for the vertical case,
; pshufb with filter_h2_shuf for the horizontal case) and pmaddubsw applies
; both byte weights from bilinear_filter_vb in one instruction. Two output
; rows are produced per loop iteration.
;
; Register roles inside the generated functions:
;   r0 = dst pointer, r1 = dst row step
;   r2 = src pointer, r3 = src row step
;   r4 = number of rows still to produce (treated as a 32-bit value)
;   r5 = fraction used by the _h variant, r6 = fraction used by the _v variant
;        (scaled by 16 to index 16-byte table rows)
;
; Rounding matches the word-based bilinear code: shift right by 2, then pavgw
; against zero.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl       r6d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                          ; zero, used by pavgw
    mova      m3, [bilinear_filter_vb+r6-16]  ; interleaved byte weights
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                          ; pair row 0 with row 1
    punpcklbw m1, m2                          ; pair row 1 with row 2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; Test only the low 32 bits: the upper half of a 64-bit register carrying
    ; a 32-bit argument is not guaranteed to be clean, and the pixel-copy
    ; functions in this file also use r4d.
    sub       r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl       r5d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                          ; zero, used by pavgw
    mova      m2, [filter_h2_shuf]            ; byte shuffle applied to each source row
    mova      m3, [bilinear_filter_vb+r5-16]  ; interleaved byte weights
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; Test only the low 32 bits, for the same reason as in the _v variant.
    sub       r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
11991 | 861 |
;-----------------------------------------------------------------------------
; put_vp8_pixels8_mmx: plain copy of 8-byte-wide rows, two rows per pass
;   r0 = dst pointer, r1 = dst row step
;   r2 = src pointer, r3 = src row step
;   r4d = number of rows to copy
;-----------------------------------------------------------------------------
cglobal put_vp8_pixels8_mmx, 5,5
.copy_row_pair:
    movq      mm0, [r2+r3*0]
    movq      mm1, [r2+r3*1]
    movq      [r0+r1*0], mm0
    movq      [r0+r1*1], mm1
    lea       r2, [r2+r3*2]                   ; lea leaves the flags alone
    lea       r0, [r0+r1*2]
    sub       r4d, 2
    jg .copy_row_pair
    REP_RET
873 | |
;-----------------------------------------------------------------------------
; put_vp8_pixels16_mmx: plain copy of 16-byte-wide rows using four 8-byte
; MMX moves per pair of rows
;   r0 = dst pointer, r1 = dst row step
;   r2 = src pointer, r3 = src row step
;   r4d = number of rows to copy
;-----------------------------------------------------------------------------
cglobal put_vp8_pixels16_mmx, 5,5
.copy_row_pair:
    ; load both rows completely before writing anything
    movq      mm0, [r2+r3*0+0]
    movq      mm1, [r2+r3*0+8]
    movq      mm2, [r2+r3*1+0]
    movq      mm3, [r2+r3*1+8]
    movq      [r0+r1*0+0], mm0
    movq      [r0+r1*0+8], mm1
    movq      [r0+r1*1+0], mm2
    movq      [r0+r1*1+8], mm3
    lea       r2, [r2+r3*2]
    lea       r0, [r0+r1*2]
    sub       r4d, 2
    jg .copy_row_pair
    REP_RET
889 | |
;-----------------------------------------------------------------------------
; put_vp8_pixels16_sse: plain copy of 16-byte-wide rows, two rows per pass
;   r0 = dst pointer, r1 = dst row step (stores use movaps, so every dst row
;        has to be 16-byte aligned)
;   r2 = src pointer, r3 = src row step (loads use movups, no alignment
;        requirement)
;   r4d = number of rows to copy
;-----------------------------------------------------------------------------
cglobal put_vp8_pixels16_sse, 5,5,2
.copy_row_pair:
    movups    xmm0, [r2+r3*0]
    movups    xmm1, [r2+r3*1]
    movaps    [r0+r1*0], xmm0
    movaps    [r0+r1*1], xmm1
    lea       r2, [r2+r3*2]
    lea       r0, [r0+r1*2]
    sub       r4d, 2
    jg .copy_row_pair
    REP_RET
901 | |
11975 | 902 ;----------------------------------------------------------------------------- |
903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
904 ;----------------------------------------------------------------------------- | |
905 | |
;-----------------------------------------------------------------------------
; ADD_DC pos, neg, offset, mov_insn
;
; Adds a signed per-pixel DC value to four destination rows using unsigned
; saturating byte arithmetic: each pixel becomes sat(sat(pix + pos) - neg).
;   pos      register with the byte values to add
;   neg      register with the byte values to subtract
;   offset   byte offset applied to every row address
;   mov_insn move instruction used for both the loads and the stores
;
; The rows are addressed as r0, r0+r2, r1 and r1+r2. All four rows are loaded
; before any of them is written back. Clobbers m2-m5.
;-----------------------------------------------------------------------------
%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]

    paddusb   m2, %1
    psubusb   m2, %2
    paddusb   m3, %1
    psubusb   m3, %2
    paddusb   m4, %1
    psubusb   m4, %2
    paddusb   m5, %1
    psubusb   m5, %2

    %4        [r0+%3],    m2
    %4        [r0+r2+%3], m3
    %4        [r1+%3],    m4
    %4        [r1+r2+%3], m5
%endmacro
924 | |
;-----------------------------------------------------------------------------
; vp8_idct_dc_add_mmx: DC-only inverse transform of one 4x4 block
;   r0 = dst, r1 = coefficient block, r2 = dst row step
;
; dc = (block[0] + 4) >> 3 is added to the four 4-pixel rows at r0. The
; coefficient storage is cleared with one 32-bit store, which covers block[0]
; and block[1]. r1 is reused afterwards as the pointer to dst row 2.
;-----------------------------------------------------------------------------
INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; fetch the DC coefficient, then wipe it from the block
    movd      m0, [r1]
    pxor      m1, m1
    movd      [r1], m1

    ; dc = (coef + 4) >> 3
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m1, m0                          ; m1 = -dc (m1 was zero)

    ; packuswb clamps to 0..255, so one of the two registers ends up zero;
    ; then replicate the low byte across the low 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0
    punpcklwd m0, m0
    packuswb  m1, m1
    punpcklbw m1, m1
    punpcklwd m1, m1

    ; add DC to the 4 rows
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, movh
    RET
947 | |
;-----------------------------------------------------------------------------
; vp8_idct_dc_add_sse4: DC-only inverse transform of one 4x4 block
;   r0 = dst, r1 = coefficient block, r2 = dst row step
;
; All 16 destination pixels are widened to words, dc = (block[0] + 4) >> 3 is
; added with plain word arithmetic, and packuswb performs the clamp. The
; results are written back row by row with movd/pextrd. The coefficient
; storage is cleared with one 32-bit store; r1 is reused afterwards as the
; pointer to dst row 2.
;-----------------------------------------------------------------------------
INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; fetch the DC coefficient, then wipe it from the block
    movd      m0, [r1]
    pxor      m1, m1
    movd      [r1], m1

    ; dc = (coef + 4) >> 3, broadcast to all 8 words
    paddw     m0, [pw_4]
    psraw     m0, 3
    pshuflw   m0, m0, 0
    punpcklqdq m0, m0

    ; gather the four 4-pixel rows into two registers of 8 words each
    lea       r1, [r0+r2*2]
    movd      m2, [r0]
    movd      m3, [r0+r2]
    movd      m4, [r1]
    movd      m5, [r1+r2]
    punpckldq m2, m3                          ; rows 0 and 1
    punpckldq m4, m5                          ; rows 2 and 3
    punpcklbw m2, m1                          ; byte->word
    punpcklbw m4, m1

    ; add, clamp and scatter back
    paddw     m2, m0
    paddw     m4, m0
    packuswb  m2, m4
    movd      [r0], m2
    pextrd    [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd    [r1+r2], m2, 3
    RET
977 | |
978 ;----------------------------------------------------------------------------- | |
979 ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); | |
980 ;----------------------------------------------------------------------------- | |
981 | |
;-----------------------------------------------------------------------------
; vp8_idct_dc_add4_mmx: DC-only inverse transform of four 4x4 blocks
;   r0 = dst, r1 = coefficient blocks (32 bytes apart), r2 = dst row step
;
; The four DC values (A, B, C, D) are rounded with (x + 4) >> 3 and applied to
; a 16-pixel-wide, 4-row area: A and B go to bytes 0..7 of each row, C and D
; to bytes 8..15. The first 32 bits of every block are cleared. r1 is reused
; afterwards as the pointer to dst row 2.
;-----------------------------------------------------------------------------
INIT_MMX
cglobal vp8_idct_dc_add4_mmx, 3, 3
    ; load the four DC coefficients into one register
    movd      m0, [r1+32*0]                   ; A
    movd      m1, [r1+32*2]                   ; C
    punpcklwd m0, [r1+32*1]                   ; A B
    punpcklwd m1, [r1+32*3]                   ; C D
    punpckldq m0, m1                          ; A B C D

    ; wipe the coefficients from the blocks
    pxor      m6, m6
    movd      [r1+32*0], m6
    movd      [r1+32*1], m6
    movd      [r1+32*2], m6
    movd      [r1+32*3], m6

    ; dc = (coef + 4) >> 3, and its negation
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m6, m0                          ; m6 = -dc (m6 was zero)

    ; positive parts: clamp to bytes, then spread each DC over 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0                          ; AABBCCDD
    movq      m1, m0
    punpcklbw m0, m0                          ; AAAABBBB
    punpckhbw m1, m1                          ; CCCCDDDD

    ; negative parts, same layout
    packuswb  m6, m6
    punpcklbw m6, m6                          ; AABBCCDD
    movq      m7, m6
    punpcklbw m6, m6                          ; AAAABBBB
    punpckhbw m7, m7                          ; CCCCDDDD

    ; add DC: left 8 pixels, then right 8 pixels
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
1016 | |
;-----------------------------------------------------------------------------
; vp8_idct_dc_add4_sse2: DC-only inverse transform of four 4x4 blocks
;   r0 = dst, r1 = coefficient blocks (32 bytes apart), r2 = dst row step
;
; Same operation as the MMX variant, but one 16-byte register covers the
; whole 16-pixel row, so a single ADD_DC pass is enough. The first 32 bits of
; every block are cleared. r1 is reused afterwards as the pointer to dst
; row 2.
;-----------------------------------------------------------------------------
INIT_XMM
cglobal vp8_idct_dc_add4_sse2, 3, 3
    ; load the four DC coefficients into the low 4 words
    movd      m0, [r1+32*0]                   ; A
    movd      m1, [r1+32*2]                   ; C
    punpcklwd m0, [r1+32*1]                   ; A B
    punpcklwd m1, [r1+32*3]                   ; C D
    punpckldq m0, m1                          ; A B C D

    ; wipe the coefficients from the blocks
    pxor      m1, m1
    movd      [r1+32*0], m1
    movd      [r1+32*1], m1
    movd      [r1+32*2], m1
    movd      [r1+32*3], m1

    ; dc = (coef + 4) >> 3, and its negation
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m1, m0                          ; m1 = -dc (m1 was zero)

    ; clamp to bytes, then spread each DC over 4 consecutive bytes
    packuswb  m0, m0
    punpcklbw m0, m0                          ; AABBCCDD
    punpcklbw m0, m0                          ; AAAABBBBCCCCDDDD
    packuswb  m1, m1
    punpcklbw m1, m1
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, mova
    RET
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1046 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1047 ;----------------------------------------------------------------------------- |
12013 | 1048 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1049 ;----------------------------------------------------------------------------- | |
1050 | |
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
;
; Rotation step of the inverse transform:
;   a = mul_35468(a) - mul_20091(b)
;   b = mul_20091(a) + mul_35468(b)      (using the incoming value of a)
;
; mul_20091(x) is built as x + pmulhw(x, 20091) and mul_35468(x) as
; pmulhw(2*x, 17734). The macro assumes that m6/m7 already hold words of
; 20091/17734. tmp1 and tmp2 are clobbered.
%macro VP8_MULTIPLY_SUMSUB 4
    ; tmp1 = mul_20091(a)
    mova      %3, %1
    pmulhw    %3, m6
    paddw     %3, %1
    ; tmp2 = mul_20091(b)
    mova      %4, %2
    pmulhw    %4, m6
    paddw     %4, %2
    ; a = mul_35468(a), b = mul_35468(b)
    paddw     %1, %1
    pmulhw    %1, m7
    paddw     %2, %2
    pmulhw    %2, m7
    ; combine
    psubw     %1, %4
    paddw     %2, %3
%endmacro
1067 | |
; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, tmp_a, tmp_b
;
; One 1-D pass of the 4x4 inverse transform over four register indexes
; (numbers, not names; the macro prepends "m" itself):
;   x0 = in0 + in2                       x1 = in0 - in2
;   x2 = mul_35468(in1) - mul_20091(in3) x3 = mul_20091(in1) + mul_35468(in3)
;   out0 = x0 + x3 (tmp0)                out1 = x1 + x2 (tmp1)
;   out2 = x1 - x2 (tmp2)                out3 = x0 - x3 (tmp3)
;
; The last two arguments are scratch register indexes. m6/m7 must hold the
; constant words 20091/17734. The trailing SWAPs only rename registers so the
; outputs end up under the same indexes as the inputs.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5           ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6      ; t2, t3
    SUMSUB_BA           m%4, m%3, m%5           ; tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5           ; tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
1081 | |
;-----------------------------------------------------------------------------
; VP8_IDCT_ADD opt
;
; Emits vp8_idct_add_<opt>: full 4x4 inverse transform whose result is added
; to the destination.
;   r0 = dst, r1 = coefficient block (32 bytes), r2 = dst row step
;
; The coefficient block is zeroed as soon as it has been loaded. The "sse"
; flavour does this with two 16-byte xorps/movaps stores through xmm0, the
; other flavour with four 8-byte MMX stores; the transform itself is MMX in
; both cases. r1 is reused afterwards as the pointer to dst row 2.
;-----------------------------------------------------------------------------
INIT_MMX
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq      m0, [r1+ 0]
    movq      m1, [r1+ 8]
    movq      m2, [r1+16]
    movq      m3, [r1+24]

    ; clear the block
%ifidn %1, sse
    xorps     xmm0, xmm0
    movaps    [r1+ 0], xmm0
    movaps    [r1+16], xmm0
%else
    pxor      m4, m4
    movq      [r1+ 0], m4
    movq      [r1+ 8], m4
    movq      [r1+16], m4
    movq      [r1+24], m4
%endif

    ; constants expected by the transform macros
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]

    ; actual IDCT: 1-D pass, transpose, rounding term, 1-D pass, transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: rows 0/1 through r0, rows 2/3 through r1
    lea       r1, [r0+2*r2]
    pxor      m4, m4
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse
12013 | 1122 |
1123 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1124 ; void vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1125 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1126 |
; SCATTER_WHT reg_a, reg_b, first_block
;
; Writes the four words of m<reg_a> and the four words of m<reg_b> to memory
; at r0, one word every 2*16 bytes. Word k of reg_a goes to slot
; (4*k + first_block) and word k of reg_b to the slot right after it.
; Clobbers r1, r2 and both vector registers (they are shifted right by 32).
%macro SCATTER_WHT 3
    ; words 0 and 1 travel through the low dword of each register
    movd      r1d, m%1
    movd      r2d, m%2
    psrlq     m%1, 32
    psrlq     m%2, 32
    mov       [r0+2*16*(0+%3)], r1w
    mov       [r0+2*16*(1+%3)], r2w
    shr       r1d, 16
    shr       r2d, 16
    mov       [r0+2*16*(4+%3)], r1w
    mov       [r0+2*16*(5+%3)], r2w

    ; words 2 and 3, now sitting in the low dword
    movd      r1d, m%1
    movd      r2d, m%2
    mov       [r0+2*16*(8+%3)], r1w
    mov       [r0+2*16*(9+%3)], r2w
    shr       r1d, 16
    shr       r2d, 16
    mov       [r0+2*16*(12+%3)], r1w
    mov       [r0+2*16*(13+%3)], r2w
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1147 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; HADAMARD4_1D r0, r1, r2, r3
;
; One 1-D pass of a 4-point Hadamard transform over four register indexes
; (numbers; the macro prepends "m" itself). It consists of two SUMSUB_BADC
; butterfly stages followed by a register renaming (SWAP) that puts the
; results back under the argument indexes.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP         %1,  %4,  %3
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1153 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
;-----------------------------------------------------------------------------
; vp8_luma_dc_wht_mmx: 4x4 Walsh-Hadamard transform of the luma DC values
;   r0 = output coefficient blocks; each result word is written 2*16 bytes
;        after the previous one (see SCATTER_WHT)
;   r1 = the 16 input DC words (32 bytes); the register is reused as scratch
;        once the input has been loaded
;   r2 = scratch
;
; Sequence: 1-D Hadamard, transpose, add pw_3 to the first register as the
; rounding term, 1-D Hadamard, arithmetic shift right by 3, scatter.
;-----------------------------------------------------------------------------
INIT_MMX
cglobal vp8_luma_dc_wht_mmx, 2,3
    ; load the 4x4 input
    movq      m0, [r1]
    movq      m1, [r1+8]
    movq      m2, [r1+16]
    movq      m3, [r1+24]

    ; first pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    ; second pass, with the rounding term folded into the first input
    paddw     m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3

    ; scale
    psraw     m0, 3
    psraw     m1, 3
    psraw     m2, 3
    psraw     m3, 3

    ; write one word into each destination block
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1171 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1172 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1173 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1174 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1175 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1176 ; macro called with 7 mm register indexes as argument, and 4 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1177 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1178 ; first 4 mm registers will carry the transposed pixel data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1179 ; the other three are scratchspace (one would be sufficient, but this allows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1180 ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1181 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1182 ; first two regular registers are buf+4*stride and buf+5*stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1183 ; third is -stride, fourth is +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; READ_8x4_INTERLEAVED v0, v1, v2, v3, s0, s1, s2, base_a, base_b, neg, pos
;
; Loads 8 rows (A-H) of 4 pixels each and byte-interleaves neighbouring rows:
;   m<v0> = A/B, m<v1> = C/D, m<v2> = E/F, m<v3> = G/H
; s0-s2 are scratch register indexes. Rows are addressed relative to the two
; base registers with the two step registers:
;   A = base_a + 4*neg   B = base_b + 4*neg   C = base_a + 2*neg
;   D = base_a +   neg   E = base_a           F = base_b
;   G = base_b +   pos   H = base_b + 2*pos
; Scratch s0 is used twice (for row B, then for row H), so the A/B interleave
; has to happen before row H is loaded.
%macro READ_8x4_INTERLEAVED 11
    ; rows A and B
    movd      m%1, [%8+%10*4]                 ; A0-3
    movd      m%5, [%9+%10*4]                 ; B0-3
    punpcklbw m%1, m%5                        ; A/B interleaved

    ; rows C and D
    movd      m%2, [%8+%10*2]                 ; C0-3
    movd      m%6, [%8+%10]                   ; D0-3
    punpcklbw m%2, m%6                        ; C/D interleaved

    ; rows E and F
    movd      m%3, [%8]                       ; E0-3
    movd      m%7, [%9]                       ; F0-3
    punpcklbw m%3, m%7                        ; E/F interleaved

    ; rows G and H
    movd      m%4, [%9+%11]                   ; G0-3
    movd      m%5, [%9+%11*2]                 ; H0-3
    punpcklbw m%4, m%5                        ; G/H interleaved
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1199 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1200 ; macro called with 7 mm register indexes as argument, and 5 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1201 ; first 11 mean the same as READ_8x4_INTERLEAVED above |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1202 ; fifth regular register is scratchspace to reach the bottom 8 rows, it |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1203 ; will be set to second regular register + 8*stride at the end |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
%macro READ_16x4_INTERLEAVED 12
    ; Load 16 rows (A-P) of 4 bytes each and byte-interleave them so that each
    ; result register holds four rows.
    ;   %1-%7 : xmm register indexes (%1-%4 receive results, %5-%7 are scratch)
    ;   %8,%9 : two row base pointers (used for rows A-H)
    ;   %10   : offset register used with scales 4/2/1 (rows A-D, I-L)
    ;   %11   : offset register used with scales 1/2 (rows G-H, O-P); also the
    ;           step used to derive %12
    ;   %12   : scratch pointer for rows I-P; left at %8 + 9*%11 on exit
    ; Result: m%1 = A/B/I/J, m%2 = C/D/K/L, m%3 = E/F/M/N, m%4 = G/H/O/P.
    ;
    ; The base of the lower 8 rows is derived from the macro's own arguments
    ; (%8 and %11) rather than from fixed registers, so the macro stays correct
    ; for any register assignment chosen by the caller.
    lea           %12, [%8+8*%11]   ; %12 = %8 + 8*%11 (rows I/K/L/M)

    ; read (and interleave) those addressable by %8 and %12: A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11          ; %12 += %11: now addresses rows J/N/O/P
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 and %12: B/F/G/H/J/N/O/P
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%4, [%12+%10*4]  ; J0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%6, [%12]        ; N0-3
    punpcklbw     m%5, m%4          ; B/J
    punpcklbw     m%7, m%6          ; F/N
    punpcklbw     m%1, m%5          ; A/B/I/J interleaved
    punpcklbw     m%3, m%7          ; E/F/M/N interleaved
    movd          m%4, [%9+%11]     ; G0-3
    movd          m%6, [%12+%11]    ; O0-3
    movd          m%5, [%9+%11*2]   ; H0-3
    movd          m%7, [%12+%11*2]  ; P0-3
    punpcklbw     m%4, m%6          ; G/O
    punpcklbw     m%5, m%7          ; H/P
    punpcklbw     m%4, m%5          ; G/H/O/P interleaved
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1241 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1242 ; write 4 mm registers of 2 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1243 ; first four arguments are mm register indexes containing source data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1244 ; last four are registers containing buf+4*stride, buf+5*stride, |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1245 ; -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
%macro WRITE_4x2D 8
    ; Store the two dwords of each of four mm registers to eight rows.
    ;   %1-%4 : mm register indexes holding the data (upper dword is moved
    ;           into the lower half in the process, so contents change)
    ;   %5,%6 : two row base pointers
    ;   %7,%8 : offset registers (scaled by 4/2/1 and 1/2 respectively)

    ; low dword of every register
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4

    ; duplicate the high dword into the low half so movd can store it
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4

    ; former high dword of every register
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1261 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1262 ; write 4 xmm registers of 4 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1263 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1264 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1265 ; we add 1*stride to the third regular register in the process |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1266 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1267 ; same memory region), or 8 if they cover two separate buffers (third one points to |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1268 ; a different memory region than the first two), allowing for more optimal code for |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1269 ; the 16-width case |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
%macro WRITE_4x4D 10
    ; Store the four dwords of each of four xmm registers, one dword per row.
    ;   %1-%4 : xmm register indexes holding the data (shifted right by one
    ;           dword per step, so they are consumed)
    ;   %5-%7 : three row base pointers; %7 is advanced by %9 along the way
    ;   %8,%9 : offset registers (scaled by 4/2/1)
    ;   %10   : 16 or 8; selects which base pointer is used for some rows.
    ;           With %10 == 8 the second dword of m%3 cannot be stored via %6,
    ;           so it is parked in %5 (whose pointer value is no longer needed
    ;           by then) and written through %7 after %7 has been advanced.

    ; ---- dword 0 of each register ----
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; ---- dword 1 ----
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; ---- dword 2 ----
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1          ; last use of %5 as an address in this mode
    movd           %5, m%3          ; park dword 1 of m%3 (not yet stored) in %5
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9           ; advance %7 by one %9 step for the last pass

    ; ---- dword 3 ----
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d          ; write the parked dword 1 of m%3
    movd    [%6+%8*2], m%1          ; %5 was overwritten, so address via %6
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1321 |
%macro SPLATB_REG 3-4
    ; Broadcast the low byte of %2 to every byte of SIMD register %1.
    ;   %1 : destination SIMD register
    ;   %2 : source operand for movd
    ;   %3 : instruction-set tag (ssse3, mmx, or anything else)
    ;   %4 : shuffle-control register for pshufb, used by the ssse3 path only
    movd           %1, %2
%ifidn %3, ssse3
    pshufb         %1, %4           ; byte shuffle by the control register %4
%elif mmsize == 16                  ; sse2
    punpcklbw      %1, %1           ; byte -> word
    pshuflw        %1, %1, 0x0      ; word 0 -> low four words
    punpcklqdq     %1, %1           ; low qword -> both qwords
%elifidn %3, mmx
    punpcklbw      %1, %1           ; byte  -> word
    punpcklwd      %1, %1           ; word  -> dword
    punpckldq      %1, %1           ; dword -> qword
%else                               ; mmxext
    punpcklbw      %1, %1           ; byte -> word
    pshufw         %1, %1, 0x0      ; word 0 -> all four words
%endif
%endmacro
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1339 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1340 %macro SIMPLE_LOOPFILTER 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1341 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1342 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1343 mov r5, rsp ; backup stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1344 and rsp, ~(mmsize-1) ; align stack |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1345 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1346 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1347 mov r3, 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1348 %endif |
12210 | 1349 %ifidn %1, ssse3 |
1350 pxor m0, m0 | |
1351 %endif | |
1352 SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register | |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1353 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1354 ; set up indexes to address 4 rows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1355 mov r2, r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1356 neg r1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1357 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1358 lea r0, [r0+4*r2-2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1359 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1360 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1361 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1362 %if mmsize == 8 ; mmx / mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1363 .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1364 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1365 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1366 ; read 4 half/full rows of pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1367 mova m0, [r0+r1*2] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1368 mova m1, [r0+r1] ; p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1369 mova m2, [r0] ; q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1370 mova m3, [r0+r2] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1371 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1372 lea r4, [r0+r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1373 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1374 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1375 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1376 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1377 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1378 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1379 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1380 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1381 mova [rsp], m0 ; store p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1382 mova [rsp+mmsize], m3 ; store q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1383 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1384 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1385 ; simple_limit |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1386 mova m5, m2 ; m5=backup of q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1387 mova m6, m1 ; m6=backup of p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1388 psubusb m1, m2 ; p0-q0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1389 psubusb m2, m6 ; q0-p0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1390 por m1, m2 ; FFABS(p0-q0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1391 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1392 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1393 mova m4, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1394 mova m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1395 psubusb m3, m0 ; q1-p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1396 psubusb m0, m4 ; p1-q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1397 por m3, m0 ; FFABS(p1-q1) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1398 mova m0, [pb_80] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1399 pxor m2, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1400 pxor m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1401 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1402 pand m3, [pb_FE] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1403 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1404 paddusb m3, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1405 psubusb m3, m7 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1406 pxor m1, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1407 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1408 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1409 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1410 mova m4, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1411 pxor m5, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1412 pxor m0, m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1413 psubsb m5, m0 ; q0-p0 (signed) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1414 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1415 paddsb m2, m5 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1416 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1417 pand m2, m3 ; apply filter mask (m3) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1418 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1419 mova m3, [pb_F8] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1420 mova m1, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1421 paddsb m2, [pb_4] ; f1<<3=a+4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1422 paddsb m1, [pb_3] ; f2<<3=a+3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1423 pand m2, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1424 pand m1, m3 ; cache f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1425 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1426 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1427 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1428 pcmpgtb m0, m2 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1429 psubb m3, m2 ; -f1<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1430 psrlq m2, 3 ; +f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1431 psrlq m3, 3 ; -f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1432 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1433 pandn m0, m2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1434 psubusb m4, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1435 paddusb m4, m3 ; q0-f1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1436 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1437 pxor m0, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1438 pxor m3, m3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1439 pcmpgtb m0, m1 ; which values are <0? |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1440 psubb m3, m1 ; -f2<<3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1441 psrlq m1, 3 ; +f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1442 psrlq m3, 3 ; -f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1443 pand m3, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1444 pandn m0, m1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1445 paddusb m6, m0 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1446 psubusb m6, m3 ; p0+f2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1447 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1448 ; store |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1449 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1450 mova [r0], m4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1451 mova [r0+r1], m6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1452 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1453 mova m0, [rsp] ; p1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1454 SWAP 2, 4 ; q0 (filtered, was in m4) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1455 SWAP 1, 6 ; p0 (filtered, was in m6) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1456 mova m3, [rsp+mmsize] ; q1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1457 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1458 TRANSPOSE4x4B 0, 1, 2, 3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1459 %if mmsize == 16 ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1460 add r3, r1 ; change from r4*8*stride to r0+8*stride |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1461 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1462 %else ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1463 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1464 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1465 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1466 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1467 %if mmsize == 8 ; mmx/mmxext |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1468 ; next 8 pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1469 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1470 add r0, 8 ; advance 8 cols = pixels |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1471 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1472 lea r0, [r0+r2*8] ; advance 8 rows = lines |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1473 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1474 dec r3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1475 jg .next8px |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1476 %ifidn %2, v |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1477 REP_RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1478 %else ; h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1479 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1480 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1481 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1482 %else ; sse2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1483 %ifidn %2, h |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1484 mov rsp, r5 ; restore stack pointer |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1485 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1486 RET |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1487 %endif |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1488 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1489 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; Instantiate the simple loop filter once per instruction set and direction.
; The second argument selects the vertical (v) or horizontal (h) variant; the
; macro body switches on it with %ifidn. The body also branches on
; mmsize == 8 (mmx/mmxext) versus mmsize == 16 (sse2), so the INIT_MMX /
; INIT_XMM lines below presumably select that width before each group
; (they are defined outside this view, likely in x86inc.asm -- confirm there).
; NOTE(review): the first argument looks like the function-name suffix and the
; third like a register count handed to cglobal, but the macro header is not
; visible from here -- verify against the %macro definition.
1490 INIT_MMX |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1491 SIMPLE_LOOPFILTER mmx, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1492 SIMPLE_LOOPFILTER mmx, h, 6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1493 SIMPLE_LOOPFILTER mmxext, v, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1494 SIMPLE_LOOPFILTER mmxext, h, 6 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1495 INIT_XMM |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1496 SIMPLE_LOOPFILTER sse2, v, 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1497 SIMPLE_LOOPFILTER sse2, h, 6 |
12210 | 1498 SIMPLE_LOOPFILTER ssse3, v, 3 |
1499 SIMPLE_LOOPFILTER ssse3, h, 6 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1500 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1501 ;----------------------------------------------------------------------------- |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1502 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1503 ; int flimE, int flimI, int hev_thr); |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1504 ;----------------------------------------------------------------------------- |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1505 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1506 %macro INNER_LOOPFILTER 5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1507 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1508 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1509 %define dst8_reg r1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1510 %define mstride_reg r2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1511 %define E_reg r3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1512 %define I_reg r4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1513 %define hev_thr_reg r5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1514 %else ; luma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1515 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1516 %define mstride_reg r1 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1517 %define E_reg r2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1518 %define I_reg r3 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1519 %define hev_thr_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1520 %ifdef m8 ; x86-64, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1521 %define dst8_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1522 %elif mmsize == 16 ; x86-32, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1523 %define dst8_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1524 %else ; x86-32, mmx/mmxext |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1525 %define cnt_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1526 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1527 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1528 %define dst_reg r0 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1529 %define stride_reg E_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1530 %define dst2_reg I_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1531 %ifndef m8 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1532 %define stack_reg hev_thr_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1533 %endif |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1534 |
12210 | 1535 %ifidn %1, ssse3 |
1536 pxor m7, m7 | |
1537 %endif | |
1538 | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1539 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1540 ; splat function arguments |
12210 | 1541 SPLATB_REG m0, E_reg, %1, m7 ; E |
1542 SPLATB_REG m1, I_reg, %1, m7 ; I | |
1543 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1544 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1545 ; align stack |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1546 mov stack_reg, rsp ; backup stack pointer |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1547 and rsp, ~(mmsize-1) ; align stack |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1548 %ifidn %2, v |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1549 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1550 ; [3]=hev() result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1551 %else ; h |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1552 sub rsp, mmsize * 5 ; extra storage space for transposes |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1553 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1554 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1555 %define flim_E [rsp] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1556 %define flim_I [rsp+mmsize] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1557 %define hev_thr [rsp+mmsize*2] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1558 %define mask_res [rsp+mmsize*3] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1559 %define p0backup [rsp+mmsize*3] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1560 %define q0backup [rsp+mmsize*4] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1561 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1562 mova flim_E, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1563 mova flim_I, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1564 mova hev_thr, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1565 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1566 %else ; sse2 on x86-64 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1567 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1568 %define flim_E m9 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1569 %define flim_I m10 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1570 %define hev_thr m11 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1571 %define mask_res m12 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1572 %define p0backup m12 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1573 %define q0backup m8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1574 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1575 ; splat function arguments |
12210 | 1576 SPLATB_REG flim_E, E_reg, %1, m7 ; E |
1577 SPLATB_REG flim_I, I_reg, %1, m7 ; I | |
1578 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1579 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1580 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1581 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1582 mov cnt_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1583 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1584 mov stride_reg, mstride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1585 neg mstride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1586 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1587 lea dst_reg, [dst_reg + stride_reg*4-4] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1588 %if %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1589 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1590 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1591 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1592 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1593 %if mmsize == 8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1594 .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1595 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1596 ; read |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1597 lea dst2_reg, [dst_reg + stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1598 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1599 %if %4 == 8 && mmsize == 16 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1600 %define movrow movh |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1601 %else |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1602 %define movrow mova |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1603 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1604 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1605 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1606 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1607 movrow m5, [dst2_reg] ; q1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1608 movrow m6, [dst2_reg+ stride_reg] ; q2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1609 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1610 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1611 movhps m0, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1612 movhps m2, [dst8_reg+mstride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1613 add dst8_reg, stride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1614 movhps m1, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1615 movhps m5, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1616 movhps m6, [dst8_reg+ stride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1617 movhps m7, [dst8_reg+ stride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1618 add dst8_reg, mstride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1619 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1620 %elif mmsize == 8 ; mmx/mmxext (h) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1621 ; read 8 rows of 8px each |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1622 movu m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1623 movu m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1624 movu m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1625 movu m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1626 movu m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1627 movu m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1628 movu m6, [dst2_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1629 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1630 ; 8x8 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1631 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1632 mova q0backup, m1 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1633 movu m7, [dst2_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1634 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1635 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1636 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1637 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1638 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1639 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1640 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1641 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1642 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1643 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1644 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1645 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1646 %else ; sse2 (h) |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1647 %if %4 == 16 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1648 lea dst8_reg, [dst_reg + stride_reg*8] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1649 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1650 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1651 ; read 16 rows of 8px each, interleave |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1652 movh m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1653 movh m1, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1654 movh m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1655 movh m5, [dst8_reg+mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1656 movh m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1657 movh m6, [dst8_reg+mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1658 movh m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1659 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1660 punpcklbw m0, m1 ; A/I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1661 punpcklbw m2, m5 ; C/K |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1662 punpcklbw m3, m6 ; D/L |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1663 punpcklbw m4, m7 ; E/M |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1664 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1665 add dst8_reg, stride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1666 movh m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1667 movh m6, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1668 movh m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1669 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1670 punpcklbw m1, m6 ; B/J |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1671 punpcklbw m5, m7 ; F/N |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1672 movh m6, [dst2_reg+ stride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1673 movh m7, [dst8_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1674 punpcklbw m6, m7 ; G/O |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1675 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1676 ; 8x16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1677 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1678 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1679 SWAP 1, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1680 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1681 mova q0backup, m1 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1682 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1683 movh m7, [dst2_reg+ stride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1684 movh m1, [dst8_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1685 punpcklbw m7, m1 ; H/P |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1687 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1688 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1689 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1690 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1691 SWAP 1, 8 |
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1692 SWAP 2, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1693 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1694 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1695 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1697 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1698 %ifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1699 SWAP 5, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1700 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1701 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1702 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1703 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1704 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1705 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1706 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1707 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1708 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1709 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1710 mova m4, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1711 SWAP 4, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1712 psubusb m4, m0 ; p2-p3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1713 psubusb m0, m1 ; p3-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1714 por m0, m4 ; abs(p3-p2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1715 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1716 mova m4, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1717 SWAP 4, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1718 psubusb m4, m1 ; p1-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1719 psubusb m1, m2 ; p2-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1720 por m1, m4 ; abs(p2-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1721 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1722 mova m4, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1723 SWAP 4, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1724 psubusb m4, m7 ; q2-q3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1725 psubusb m7, m6 ; q3-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 por m7, m4 ; abs(q3-q2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1727 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1728 mova m4, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1729 SWAP 4, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1730 psubusb m4, m6 ; q1-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1731 psubusb m6, m5 ; q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1732 por m6, m4 ; abs(q2-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1733 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1734 %ifidn %1, mmx |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1735 mova m4, flim_I |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1736 pxor m3, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1737 psubusb m0, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1738 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1739 psubusb m7, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1740 psubusb m6, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1741 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1742 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1743 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1744 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1745 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1746 pand m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1747 pand m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1748 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1749 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1750 pmaxub m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1751 pmaxub m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1753 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1754 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1755 SWAP 7, 3 ; now m7 is zero |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1756 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1757 movrow m3, [dst_reg +mstride_reg] ; p0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1758 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1759 movhps m3, [dst8_reg+mstride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1760 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1761 %elifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1762 SWAP 3, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1764 mova m3, p0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1765 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 mova m1, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 SWAP 1, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 mova m6, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1770 SWAP 3, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1771 psubusb m1, m3 ; p1-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 psubusb m6, m2 ; p0-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1773 por m1, m6 ; abs(p1-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 mova m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1779 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1780 pand m0, m1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1781 mova mask_res, m6 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 pmaxub m0, m1 ; max_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1784 SWAP 1, 4 ; max_hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1785 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1786 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1787 SWAP 6, 4 ; now m6 is I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1788 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1789 movrow m4, [dst_reg] ; q0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1790 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1791 movhps m4, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1792 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1793 %elifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1794 SWAP 4, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1796 mova m4, q0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1797 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1798 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1799 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1800 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1801 SWAP 7, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 psubusb m1, m5 ; q0-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1803 psubusb m7, m4 ; q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1804 por m1, m7 ; abs(q1-q0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1805 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1806 mova m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1807 psubusb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1808 psubusb m7, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1809 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1810 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1811 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1812 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1813 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1814 pand m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1815 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1816 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1817 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1818 pmaxub m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 psubusb m0, flim_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1820 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 pcmpeqb m0, m7 ; max(abs(..)) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1822 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1823 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1824 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1826 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1827 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1830 ; simple_limit |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1831 mova m1, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 SWAP 1, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1833 mova m6, m4 ; keep copies of p0/q0 around for later use |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 SWAP 6, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1835 psubusb m1, m4 ; p0-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1836 psubusb m6, m3 ; q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1837 por m1, m6 ; abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1838 paddusb m1, m1 ; m1=2*abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1839 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1840 mova m7, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1841 SWAP 7, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1842 mova m6, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1843 SWAP 6, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1844 psubusb m7, m5 ; p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1845 psubusb m6, m2 ; q1-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1846 por m7, m6 ; abs(q1-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1847 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1848 pand m7, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1849 psrlq m7, 1 ; abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1850 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 psubusb m7, flim_E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1852 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 pand m0, m7 ; normal_limit result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 %ifdef m8 ; x86-64 && sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 mova m8, [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1858 %define pb_80_var m8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 %else ; x86-32 or mmx/mmxext |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 %define pb_80_var [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 mova m7, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1864 pxor m1, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 psubsb m1, m7 ; (signed) q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1867 mova m6, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1868 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 pxor m6, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 psubsb m6, m7 ; (signed) p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 mova m7, mask_res |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1873 pandn m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 pand m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1879 mova m1, [pb_F8] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 mova m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 paddsb m7, [pb_3] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 paddsb m6, [pb_4] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1883 pand m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 pand m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 pcmpgtb m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1889 psubb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1890 psrlq m7, 3 ; +f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1891 psrlq m0, 3 ; -f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1892 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1893 pandn m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1894 psubusb m3, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 paddusb m3, m1 ; p0+f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1896 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 pcmpgtb m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 psubb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 psrlq m6, 3 ; +f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 psrlq m1, 3 ; -f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 pand m1, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 pandn m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 psubusb m4, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 paddusb m4, m1 ; q0-f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1911 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1913 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 mova m7, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 pand m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 pand m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1921 paddusb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1922 pand m1, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1923 pandn m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1924 psrlq m1, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1925 psrlq m7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1926 SWAP 0, 7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1928 psubusb m1, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 pavgb m0, m7 ; a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 pavgb m1, m7 ; -a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 psubusb m5, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 psubusb m2, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 paddusb m5, m1 ; q1-a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 paddusb m2, m0 ; p1+a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 ; store |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1939 movrow [dst_reg +mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1940 movrow [dst_reg +mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1941 movrow [dst_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1942 movrow [dst_reg + stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1943 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1944 movhps [dst8_reg+mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1945 movhps [dst8_reg+mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1946 movhps [dst8_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1947 movhps [dst8_reg+ stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1948 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 %else ; h |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1950 add dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1951 add dst2_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 ; 4x8/16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 %if mmsize == 8 ; mmx/mmxext (h) |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1957 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 %else ; sse2 (h) |
12180 | 1959 lea dst8_reg, [dst8_reg+mstride_reg+2] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1960 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1963 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 %if mmsize == 8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1965 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1966 %ifidn %2, h |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1967 sub dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1968 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1969 cmp dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1970 mov dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1971 jnz .next8px |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1972 %else |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1974 lea dst_reg, [dst_reg + stride_reg*8-2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 %else ; v |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1976 add dst_reg, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1978 dec cnt_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 jg .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1981 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1984 mov rsp, stack_reg ; restore stack pointer |
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
1985 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1986 RET |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1987 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1988 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; Instantiate INNER_LOOPFILTER once per optimization (mmx, mmxext, sse2, ssse3),
; filter direction (v, h) and block width (16, or 8 which the macro body marks as chroma).
; Arguments as the visible part of the macro uses them: %1 = optimization name,
; %2 = direction, %4 = width.
; NOTE(review): %3 and %5 are not referenced in the visible part of the macro; presumably
; they are the general-purpose and XMM register counts handed to cglobal - confirm at the
; macro header.
; NOTE(review): INIT_MMX / INIT_XMM are not defined in this file (x86inc.asm is included);
; presumably they select the 8-byte MMX resp. 16-byte XMM register set that the macro
; body distinguishes through mmsize - confirm.
1989 INIT_MMX |
12210 | 1990 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
1991 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
1992 INNER_LOOPFILTER mmxext, v, 6, 16, 0 | |
1993 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1994 |
12210 | 1995 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
1996 INNER_LOOPFILTER mmx, h, 6, 8, 0 | |
1997 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
1998 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1999 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; From here on: the sse2 and ssse3 instantiations.
2000 INIT_XMM |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2001 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
; The macro body annotates a defined m8 as "x86-64 && sse2". When it is defined, the
; horizontal luma filter is instantiated with 5 instead of 6 as its third argument.
; NOTE(review): the reason for the different count is not visible in this part of the file.
2002 %ifdef m8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2003 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2004 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2005 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2006 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2007 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2008 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2009 |
; The ssse3 set repeats the sse2 argument pattern, including the m8-dependent third argument.
12210 | 2010 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2011 %ifdef m8 | |
2012 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
2013 %else | |
2014 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
2015 %endif | |
2016 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
2017 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
2018 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2019 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2020 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2021 ; int flimE, int flimI, int hev_thr); |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2022 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2023 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2024 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2025 ; 1 and 2 are the registers to write; they can be the same register (for SSE2) |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2026 ; for pre-SSE4: |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2027 ; 3 is a general-purpose register that we will clobber |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2028 ; for SSE4: |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2029 ; 3 is a pointer to the destination's 5th line (counting lines from 0) |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2030 ; 4 is a pointer to the destination's 4th line (counting lines from 0) |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2031 ; 5/6 is -stride and +stride |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2032 ; 7 is optimization string |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; WRITE_8W: store eight 16-bit words, one word per destination line.
;   sse4 branch : all eight words come from %1 and are stored directly with pextrw;
;                 %3 and %4 are both used as pointers, %2 is not referenced.
;   other opts  : words are pulled two at a time (movd) from %1, then from %2, into the
;                 general-purpose register %3 and stored 16 bits at a time. %1 and %2 are
;                 shifted, %3 is overwritten and %4 is stepped by %5/%6 along the way.
2033 %macro WRITE_8W 7 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2034 %ifidn %7, sse4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; Word k of %1 goes to line k: words 0, 2, 3, 4 are addressed relative to %4 and
; words 1, 5, 6, 7 relative to %3 (%5 scaled for the lines before, %6 for the lines after).
2035 pextrw [%4+%5*4], %1, 0 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2036 pextrw [%3+%5*4], %1, 1 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2037 pextrw [%4+%5*2], %1, 2 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2038 pextrw [%4+%5 ], %1, 3 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2039 pextrw [%4 ], %1, 4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2040 pextrw [%3 ], %1, 5 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2041 pextrw [%3+%6 ], %1, 6 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2042 pextrw [%3+%6*2], %1, 7 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
; Every other optimization: go through general-purpose register %3, two words per movd.
2043 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Low 32 bits of %1 (words 0 and 1) -> %3.
2044 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Bring the next 32 bits of %1 down into its low dword for the second movd below:
; 8-byte registers copy the high dword over the low one, 16-byte registers shift right by 4 bytes.
2045 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2046 punpckhdq %1, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2047 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2048 psrldq %1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2049 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Word 0. NOTE(review): %3w is presumably the x86inc 16-bit alias of register %3 - confirm.
2050 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Move word 1 into the low 16 bits of %3.
2051 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Step %4 by %6 so that the same [%4+%5*4] operand now addresses the following line
; (with %5/%6 = -stride/+stride as the macro's header comment states).
2052 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2053 mov [%4+%5*4], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2054 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Next two words of %1 (words 2 and 3) -> %3.
2055 movd %3, %1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; 16-byte registers only: shift once more. NOTE(review): presumably so that the movd
; from %2 below reads the following words when %2 is the same register as %1 - confirm.
2056 %if mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2057 psrldq %1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2058 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Step %4 by %5, which cancels the earlier add of %6 when %5 == -%6.
2059 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2060 mov [%4+%5*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2061 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2062 mov [%4+%5 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2063 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Second half: low 32 bits of %2 -> %3, then prepare %2 the same way %1 was prepared above.
2064 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2065 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2066 punpckhdq %2, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2067 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2068 psrldq %2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2069 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2070 mov [%4 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2071 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2072 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2073 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Last two words of %2 -> %3; %2 is not shifted again after this.
2074 movd %3, %2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Step %4 by %6 for the final two stores.
2075 add %4, %6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2076 mov [%4+%6 ], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2077 shr %3, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2078 mov [%4+%6*2], %3w |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; 8-byte registers only: step %4 by %5, cancelling the last add when %5 == -%6.
; With 16-byte registers %4 leaves the macro advanced by %6.
2079 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2080 add %4, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2081 %endif |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2082 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2083 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2084 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2085 %macro MBEDGE_LOOPFILTER 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2086 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2087 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2088 %define dst8_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2089 %define mstride_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2090 %define E_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2091 %define I_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2092 %define hev_thr_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2093 %else ; luma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2094 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2095 %define mstride_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2096 %define E_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2097 %define I_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2098 %define hev_thr_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2099 %ifdef m8 ; x86-64, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2100 %define dst8_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2101 %elif mmsize == 16 ; x86-32, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2102 %define dst8_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2103 %else ; x86-32, mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2104 %define cnt_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2105 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2106 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2107 %define dst_reg r0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2108 %define stride_reg E_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2109 %define dst2_reg I_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2110 %ifndef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2111 %define stack_reg hev_thr_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2112 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2113 |
12210 | 2114 %ifidn %1, ssse3 |
2115 pxor m7, m7 | |
2116 %endif | |
2117 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2118 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2119 ; splat function arguments |
12210 | 2120 SPLATB_REG m0, E_reg, %1, m7 ; E |
2121 SPLATB_REG m1, I_reg, %1, m7 ; I | |
2122 SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2123 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2124 ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2125 mov stack_reg, rsp ; backup stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2126 and rsp, ~(mmsize-1) ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2127 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2128 ; [3]=hev() result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2129 ; [4]=filter tmp result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2130 ; [5]/[6] = p2/q2 backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2131 ; [7]=lim_res sign result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2132 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2133 %define flim_E [rsp] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2134 %define flim_I [rsp+mmsize] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2135 %define hev_thr [rsp+mmsize*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2136 %define mask_res [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2137 %define lim_res [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2138 %define p0backup [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2139 %define q0backup [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2140 %define p2backup [rsp+mmsize*5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2141 %define q2backup [rsp+mmsize*6] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2142 %define lim_sign [rsp+mmsize*7] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2143 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2144 mova flim_E, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2145 mova flim_I, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2146 mova hev_thr, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2147 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2148 %else ; sse2 on x86-64 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2149 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2150 %define flim_E m9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2151 %define flim_I m10 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2152 %define hev_thr m11 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2153 %define mask_res m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2154 %define lim_res m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2155 %define p0backup m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2156 %define q0backup m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 %define p2backup m13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 %define q2backup m14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 %define lim_sign m15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 ; splat function arguments |
12210 | 2162 SPLATB_REG flim_E, E_reg, %1, m7 ; E |
2163 SPLATB_REG flim_I, I_reg, %1, m7 ; I | |
2164 SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 mov cnt_reg, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2170 mov stride_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2171 neg mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2172 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2173 lea dst_reg, [dst_reg + stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %if %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2176 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2177 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2178 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2179 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2180 .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2181 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2182 ; read |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2183 lea dst2_reg, [dst_reg + stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 %if %4 == 8 && mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2186 %define movrow movh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2187 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2188 %define movrow mova |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2189 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2190 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2191 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2192 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2193 movrow m5, [dst2_reg] ; q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2194 movrow m6, [dst2_reg+ stride_reg] ; q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2195 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2196 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2197 movhps m0, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2198 movhps m2, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2199 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 movhps m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2201 movhps m5, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2202 movhps m6, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2203 movhps m7, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2204 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2205 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 %elif mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 ; read 8 rows of 8px each |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 movu m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2209 movu m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2210 movu m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2211 movu m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2212 movu m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2213 movu m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2214 movu m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2216 ; 8x8 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2217 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2218 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2219 movu m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2220 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2222 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2223 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2224 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2226 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2228 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2229 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2230 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2232 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2233 %if %4 == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 lea dst8_reg, [dst_reg + stride_reg*8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2236 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2237 ; read 16 rows of 8px each, interleave |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2238 movh m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 movh m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 movh m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 movh m5, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 movh m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2243 movh m6, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 movh m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2246 punpcklbw m0, m1 ; A/I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2247 punpcklbw m2, m5 ; C/K |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2248 punpcklbw m3, m6 ; D/L |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2249 punpcklbw m4, m7 ; E/M |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2252 movh m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2253 movh m6, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2254 movh m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 punpcklbw m1, m6 ; B/J |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 punpcklbw m5, m7 ; F/N |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 movh m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 movh m7, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 punpcklbw m6, m7 ; G/O |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 ; 8x16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 movh m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 movh m1, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 punpcklbw m7, m1 ; H/P |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 SWAP 2, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 SWAP 5, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 mova m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 SWAP 4, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 psubusb m4, m0 ; p2-p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 psubusb m0, m1 ; p3-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 por m0, m4 ; abs(p3-p2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 mova m4, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 SWAP 4, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 psubusb m4, m1 ; p1-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 mova p2backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 psubusb m1, m2 ; p2-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 por m1, m4 ; abs(p2-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 mova m4, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 SWAP 4, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 psubusb m4, m7 ; q2-q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 psubusb m7, m6 ; q3-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 por m7, m4 ; abs(q3-q2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 mova m4, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 SWAP 4, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 psubusb m4, m6 ; q1-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 mova q2backup, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 psubusb m6, m5 ; q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 por m6, m4 ; abs(q2-q1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 mova m4, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 pxor m3, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 psubusb m0, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 psubusb m7, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 psubusb m6, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 pand m7, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 pmaxub m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 pmaxub m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 SWAP 7, 3 ; now m7 is zero |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 movrow m3, [dst_reg +mstride_reg] ; p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 movhps m3, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 %elifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 SWAP 3, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 mova m3, p0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 mova m1, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 SWAP 1, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 mova m6, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 SWAP 3, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 psubusb m1, m3 ; p1-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 psubusb m6, m2 ; p0-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 por m1, m6 ; abs(p1-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 mova mask_res, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 pmaxub m0, m1 ; max_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 SWAP 1, 4 ; max_hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 SWAP 6, 4 ; now m6 is I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 movrow m4, [dst_reg] ; q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 movhps m4, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 %elifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 SWAP 4, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 mova m4, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 SWAP 7, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 psubusb m1, m5 ; q0-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 psubusb m7, m4 ; q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 por m1, m7 ; abs(q1-q0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 mova m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 psubusb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 psubusb m7, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 pand m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 pxor m7, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 pmaxub m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 psubusb m0, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 pcmpeqb m0, m7 ; max(abs(..)) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 ; simple_limit |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 mova m1, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 SWAP 1, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 mova m6, m4 ; keep copies of p0/q0 around for later use |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 SWAP 6, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 psubusb m1, m4 ; p0-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 psubusb m6, m3 ; q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 por m1, m6 ; abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 paddusb m1, m1 ; m1=2*abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 mova m7, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 SWAP 7, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 mova m6, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 SWAP 6, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 psubusb m7, m5 ; p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 psubusb m6, m2 ; q1-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 por m7, m6 ; abs(q1-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 pand m7, [pb_FE] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 psrlq m7, 1 ; abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 psubusb m7, flim_E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 pand m0, m7 ; normal_limit result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 %ifdef m8 ; x86-64 && sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 mova m8, [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 %define pb_80_var m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 %else ; x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 %define pb_80_var [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 mova m7, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 pxor m1, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 psubsb m1, m7 ; (signed) q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 mova m6, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 pxor m6, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 psubsb m6, m7 ; (signed) p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 mova m7, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 pand m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 pand lim_res, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 mova m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 mova lim_res, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 mova m1, [pb_F8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 mova m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 paddsb m7, [pb_3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 paddsb m6, [pb_4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 pand m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 pand m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 pcmpgtb m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 psubb m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 psrlq m7, 3 ; +f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 psrlq m0, 3 ; -f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 pandn m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 psubusb m3, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 paddusb m3, m1 ; p0+f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 pcmpgtb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 psrlq m6, 3 ; +f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 psrlq m1, 3 ; -f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 pand m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 pandn m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 psubusb m4, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 paddusb m4, m1 ; q0-f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 mova m7, [pw_63] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 pcmpgtb m0, m1 ; which are negative |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 punpcklbw m6, m0 ; signed byte->word |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 punpckhbw m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 mova lim_sign, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 mova mask_res, m6 ; backup for later in filter |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 mova lim_res, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 pmullw m6, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 pmullw m1, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 packsswb m6, m1 ; a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 pand m1, m0 ; -a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 pandn m0, m6 ; +a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 psubusb m3, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 paddusb m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 paddusb m3, m0 ; p0+a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 psubusb m4, m0 ; q0-a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 mova m0, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 pmullw m6, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 pmullw m1, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 pand m1, m0 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 pandn m0, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 psubusb m2, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 paddusb m5, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 paddusb m2, m0 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 psubusb m5, m0 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 pmullw m6, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 pmullw m1, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 paddw m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 %ifdef m15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 SWAP 7, 15 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 mova m7, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 packsswb m6, m1 ; a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 psubb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 pand m0, m7 ; -a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 pandn m7, m6 ; +a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 SWAP 1, 13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 SWAP 6, 14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 mova m1, p2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 mova m6, q2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 psubusb m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 paddusb m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 paddusb m1, m7 ; p2+a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 psubusb m6, m7 ; q2-a2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 ; store |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 movrow [dst2_reg+mstride_reg*4], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 movrow [dst_reg +mstride_reg*2], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 movrow [dst_reg +mstride_reg ], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 movrow [dst_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2595 movrow [dst2_reg], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2596 movrow [dst2_reg+ stride_reg ], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2597 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2599 movhps [dst8_reg+mstride_reg*2], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 movhps [dst8_reg+mstride_reg ], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 movhps [dst8_reg], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 movhps [dst8_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 movhps [dst8_reg+ stride_reg ], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 movhps [dst8_reg+ stride_reg*2], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 %else ; h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2608 inc dst_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2609 inc dst2_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2610 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2611 ; 4x8/16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 SBUTTERFLY bw, 5, 6, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2614 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 %if mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2616 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2617 add dst_reg, 4 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2618 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2619 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2620 lea dst8_reg, [dst8_reg+mstride_reg+1] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2621 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2622 lea dst_reg, [dst2_reg+mstride_reg+4] |
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2623 lea dst8_reg, [dst8_reg+mstride_reg+4] |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2624 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2625 %ifidn %2, sse4 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2626 lea dst_reg, [dst8_reg+ stride_reg] |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2627 %endif |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2628 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2629 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2630 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2631 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2632 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2633 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2634 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 sub dst_reg, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 cmp dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 mov dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2639 jnz .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2640 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2641 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 lea dst_reg, [dst_reg + stride_reg*8-5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 %else ; v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 add dst_reg, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 dec cnt_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 jg .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2649 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2650 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2651 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 mov rsp, stack_reg ; restore stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 RET |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2657 INIT_MMX |
12210 | 2658 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2659 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
2660 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 | |
2661 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2662 |
12210 | 2663 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
2664 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 | |
2665 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
2666 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2667 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2668 INIT_XMM |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2669 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2670 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2671 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2672 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2673 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2674 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16 |
12210 | 2677 |
2678 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16 | |
2679 %ifdef m8 | |
2680 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16 | |
2681 %else | |
2682 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 | |
2683 %endif | |
2684 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 | |
2685 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16 | |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2686 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2687 %ifdef m8 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2688 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2689 %else |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2690 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16 |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2691 %endif |
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2692 MBEDGE_LOOPFILTER sse4, h, 6, 8, 16 |