Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12340:2d15f62f4f8a libavcodec
VP8: move zeroing of luma DC block into the WHT
Lets us do the zeroing in asm instead of C.
Also makes it consistent with the way the regular iDCT code does it.
author | darkshikari |
---|---|
date | Mon, 02 Aug 2010 20:18:09 +0000 |
parents | 435319d67bd8 |
children | 4f13b2ded34d |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
148 pb_27_63: times 8 db 27, 63 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
149 pb_18_63: times 8 db 18, 63 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
150 pb_9_63: times 8 db 9, 63 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
151 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
152 cextern pb_1 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
153 cextern pw_3 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
154 cextern pb_3 |
11975 | 155 cextern pw_4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
156 cextern pb_4 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
157 cextern pw_9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
158 cextern pw_18 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
159 cextern pw_27 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
160 cextern pw_63 |
11975 | 161 cextern pw_64 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
162 cextern pb_80 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
163 cextern pb_F8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
164 cextern pb_FE |
11975 | 165 |
166 SECTION .text | |
167 | |
168 ;----------------------------------------------------------------------------- | |
169 ; subpel MC functions: | |
170 ; | |
171 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
172 ; uint8_t *src, int srcstride, | |
173 ; int height, int mx, int my); | |
174 ;----------------------------------------------------------------------------- | |
175 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
331 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
337 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
338 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
339 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
340 |
11975 | 341 ; 4x4 block, H-only 4-tap filter (MMXEXT); args follow the subpel MC prototype above: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx |
342 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
343 shl r5d, 4 | |
344 %ifdef PIC | |
345 lea r11, [fourtap_filter_hw_m] | |
346 %endif | |
347 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words: mm4 = F0/F1 pairs from table row mx-1, mm5 (next load) = F2/F3 pairs from row mx | |
348 movq mm5, [fourtap_filter_hw+r5] | |
349 movq mm7, [pw_64] | |
350 pxor mm6, mm6 | |
351 | |
352 .nextrow | |
353 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels, starting one byte left of src | |
354 | |
355 ; first set of 2 pixels | |
356 movq mm2, mm1 ; byte ABCD.. | |
357 punpcklbw mm1, mm6 ; byte->word ABCD (mm6 is zero) | |
358 pshufw mm0, mm2, 9 ; byte CDEF.. | |
359 punpcklbw mm0, mm6 ; byte->word CDEF | |
360 pshufw mm3, mm1, 0x94 ; word ABBC | |
361 pshufw mm1, mm0, 0x94 ; word CDDE | |
362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
363 movq mm0, mm1 ; backup CDDE for second set of pixels | |
364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
365 paddd mm3, mm1 ; finish 1st 2px | |
366 | |
367 ; second set of 2 pixels, use backup of above | |
368 punpckhbw mm2, mm6 ; byte->word EFGH | |
369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
370 pshufw mm1, mm2, 0x94 ; word EFFG | |
371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
372 paddd mm0, mm1 ; finish 2nd 2px | |
373 | |
374 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
375 packssdw mm3, mm0 ; merge dword->word (4px) | |
376 paddsw mm3, mm7 ; rounding: add pw_64 ahead of the >>7 below | |
377 psraw mm3, 7 | |
378 packuswb mm3, mm6 ; clip and word->bytes | |
379 movd [r0], mm3 ; store 4 output pixels | |
380 | |
381 ; go to next line | |
382 add r0, r1 | |
383 add r2, r3 | |
384 dec r4 ; next row; loop while rows remain | |
385 jg .nextrow | |
386 REP_RET | |
387 | |
388 ; 4x4 block, H-only 6-tap filter (MMXEXT); args follow the subpel MC prototype above: r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx | |
389 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
390 lea r5d, [r5*3] | |
391 %ifdef PIC | |
392 lea r11, [sixtap_filter_hw_m] | |
393 %endif | |
394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words: mm4 = F0/F1, mm5 = F2/F3, mm6 = F4/F5 (three consecutive table rows) | |
395 movq mm5, [sixtap_filter_hw+r5*8-32] | |
396 movq mm6, [sixtap_filter_hw+r5*8-16] | |
397 movq mm7, [pw_64] | |
398 pxor mm3, mm3 | |
399 | |
400 .nextrow | |
401 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels, starting two bytes left of src | |
402 | |
403 ; first set of 2 pixels | |
404 movq mm2, mm1 ; byte ABCD.. | |
405 punpcklbw mm1, mm3 ; byte->word ABCD (mm3 is zero here) | |
406 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
407 punpckhbw mm2, mm3 ; byte->word EFGH | |
408 punpcklbw mm0, mm3 ; byte->word CDEF | |
409 pshufw mm1, mm1, 0x94 ; word ABBC | |
410 pshufw mm2, mm2, 0x94 ; word EFFG | |
411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
412 pshufw mm3, mm0, 0x94 ; word CDDE (overwrites the zero in mm3; re-zeroed below) | |
413 movq mm0, mm3 ; backup CDDE for second set of pixels | |
414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
415 paddd mm1, mm3 ; add to 1st 2px cache | |
416 movq mm3, mm2 ; backup EFFG for second set of pixels | |
417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
418 paddd mm1, mm2 ; finish 1st 2px | |
419 | |
420 ; second set of 2 pixels, use backup of above | |
421 movd mm2, [r2+3] ; byte FGHI (4-byte load to prevent overreads) | |
422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
424 paddd mm0, mm3 ; add to 2nd 2px cache | |
425 pxor mm3, mm3 | |
426 punpcklbw mm2, mm3 ; byte->word FGHI | |
427 pshufw mm2, mm2, 0xE9 ; word GHHI | |
428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
429 paddd mm0, mm2 ; finish 2nd 2px | |
430 | |
431 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
432 packssdw mm1, mm0 ; merge dword->word (4px) | |
433 paddsw mm1, mm7 ; rounding: add pw_64 ahead of the >>7 below | |
434 psraw mm1, 7 | |
435 packuswb mm1, mm3 ; clip and word->bytes (mm3 is zero again) | |
436 movd [r0], mm1 ; store 4 output pixels | |
437 | |
438 ; go to next line | |
439 add r0, r1 | |
440 add r2, r3 | |
441 dec r4 ; next row; loop while rows remain | |
442 jg .nextrow | |
443 REP_RET | |
444 | |
445 INIT_XMM | |
;-----------------------------------------------------------------------------
; put_vp8_epel8_h4_sse2: 8-pixel-wide block, horizontal-only 4-tap filter.
; Register roles as used below:
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = row count (32-bit int), r5 = horizontal filter index
; Taps 2 and 3 are kept in m8/m9 when those registers exist, otherwise they
; are read from memory inside the loop.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
    shl       r5d, 5                          ; 32 bytes of table per filter index step
%ifdef PIC
    lea       r11, [fourtap_filter_v_m]
%endif
    lea       r5, [fourtap_filter_v+r5-32]    ; r5 -> the 4 tap vectors for this filter
    pxor      m7, m7                          ; zero for byte->word unpack
    mova      m4, [pw_64]                     ; rounding constant
    mova      m5, [r5+ 0]                     ; tap 0
    mova      m6, [r5+16]                     ; tap 1
%ifdef m8
    mova      m8, [r5+32]                     ; tap 2
    mova      m9, [r5+48]                     ; tap 3
%endif
.nextrow:
    ; load 8 pixels at each of the four tap positions
    movq      m0, [r2-1]
    movq      m1, [r2-0]
    movq      m2, [r2+1]
    movq      m3, [r2+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7

    ; multiply by the taps
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
%endif

    ; sum with saturation, round, clip, store
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4                          ; rounding
    psraw     m0, 7
    packuswb  m0, m7                          ; clip and word->bytes
    movh      [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                             ; next row; 32-bit op: the upper half
                                              ; of r4 is undefined for an int argument
    jg .nextrow
    REP_RET
492 | |
;-----------------------------------------------------------------------------
; put_vp8_epel8_h6_sse2: 8-pixel-wide block, horizontal-only 6-tap filter.
; Register roles as used below:
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = row count (32-bit int), r5 = horizontal filter index
; The six tap vectors live in m8..m13 when those registers exist, otherwise
; they are read from memory inside the loop.
;-----------------------------------------------------------------------------
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
    lea       r5d, [r5*3]
    shl       r5d, 4                          ; index * 48 * 2 = 96 bytes per filter
%ifdef PIC
    lea       r11, [sixtap_filter_v_m]
%endif
    lea       r5, [sixtap_filter_v+r5-96]     ; r5 -> the 6 tap vectors for this filter
    pxor      m7, m7                          ; zero for byte->word unpack
    mova      m6, [pw_64]                     ; rounding constant
%ifdef m8
    mova      m8, [r5+ 0]
    mova      m9, [r5+16]
    mova     m10, [r5+32]
    mova     m11, [r5+48]
    mova     m12, [r5+64]
    mova     m13, [r5+80]
%endif
.nextrow:
    ; load 8 pixels at each of the six tap positions
    movq      m0, [r2-2]
    movq      m1, [r2-1]
    movq      m2, [r2-0]
    movq      m3, [r2+1]
    movq      m4, [r2+2]
    movq      m5, [r2+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7

    ; multiply by the taps
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [r5+ 0]
    pmullw    m1, [r5+16]
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
    pmullw    m4, [r5+64]
    pmullw    m5, [r5+80]
%endif

    ; sum with saturation (same pairing order as before), round, clip, store
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                          ; rounding
    psraw     m0, 7
    packuswb  m0, m7                          ; clip and word->bytes
    movh      [r0], m0                        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                             ; next row; 32-bit op: the upper half
                                              ; of r4 is undefined for an int argument
    jg .nextrow
    REP_RET
554 | |
;-----------------------------------------------------------------------------
; FILTER_V <opt>, <width>, <xmm regs>
; Emits the vertical-only 4-tap and 6-tap subpel filters for one instruction
; set. <width> only selects the function name; the number of pixels handled
; per row is whatever movh moves for the current INIT_MMX/INIT_XMM setting.
; Register roles as used below:
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = row count (32-bit int), r6 = vertical filter index
;-----------------------------------------------------------------------------
%macro FILTER_V 3
; V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl       r6d, 5
%ifdef PIC
    lea       r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]                     ; rounding constant
    pxor      m7, m7                          ; zero for byte->word unpack
    mova      m5, [r6+48]                     ; last tap stays in a register

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                   ; read new row
    punpcklbw m4, m7
    mova      m3, m4                          ; keep unscaled copy for next iteration
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps; rows slide down one register each iteration
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh      [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                             ; next row; 32-bit op: the upper half
                                              ; of r4 is undefined for an int argument
    jg .nextrow
    REP_RET


; V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl       r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea       r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7                          ; zero for byte->word unpack

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps; rows slide down one register each iteration
    movh      m5, [r2+2*r3]                   ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh      [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec       r4d                             ; next row; 32-bit op: the upper half
                                              ; of r4 is undefined for an int argument
    jg .nextrow
    REP_RET
%endmacro
673 | |
674 INIT_MMX | |
675 FILTER_V mmxext, 4, 0 | |
676 INIT_XMM | |
677 FILTER_V sse2, 8, 8 | |
678 | |
;-----------------------------------------------------------------------------
; FILTER_BILINEAR <opt>, <width>, <xmm regs>
; Emits the vertical and horizontal bilinear filters for one instruction set.
; Two output rows are produced per loop iteration.
; Register roles as used below:
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = row count (32-bit int), r5 = horizontal fraction, r6 = vertical fraction
; Each pixel is (a*(8-frac) + b*frac + 4) >> 3, computed as psraw 2 followed
; by pavgw against zero.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov       r5d, 8*16
    shl       r6d, 4
    sub       r5d, r6d                        ; r5 = 16 * (8 - frac)
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                          ; zero for unpack and for pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16]  ; weight of the upper row
    mova      m5, [bilinear_filter_vw+r6-16]  ; weight of the lower row
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                          ; middle row is shared by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                          ; (x + 1) >> 1 completes the rounded >> 3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                          ; 32-bit op: the upper half of r4 is
                                              ; undefined for an int argument
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov       r6d, 8*16
    shl       r5d, 4
    sub       r6d, r5d                        ; r6 = 16 * (8 - frac)
%ifdef PIC
    lea       r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                          ; zero for unpack and for pavgw rounding
    mova      m4, [bilinear_filter_vw+r6-16]  ; weight of the left pixel
    mova      m5, [bilinear_filter_vw+r5-16]  ; weight of the right pixel
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                          ; (x + 1) >> 1 completes the rounded >> 3
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                          ; 32-bit op: the upper half of r4 is
                                              ; undefined for an int argument
    jg .nextrow
    REP_RET
%endmacro
771 | |
772 INIT_MMX | |
773 FILTER_BILINEAR mmxext, 4, 0 | |
774 INIT_XMM | |
775 FILTER_BILINEAR sse2, 8, 7 | |
776 | |
;-----------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 <width>
; Emits the SSSE3 vertical and horizontal bilinear filters. Neighbouring
; pixels are interleaved as bytes and combined with one pmaddubsw against a
; table of byte weight pairs. Two output rows are produced per iteration.
; Register roles as used below:
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = row count (32-bit int), r5 = horizontal fraction, r6 = vertical fraction
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl       r6d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                          ; zero for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16]  ; interleaved byte weights
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                          ; interleave row 0 / row 1
    punpcklbw m1, m2                          ; interleave row 1 / row 2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                          ; (x + 1) >> 1 completes the rounded >> 3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                          ; 32-bit op: the upper half of r4 is
                                              ; undefined for an int argument
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl       r5d, 4
%ifdef PIC
    lea       r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                          ; zero for pavgw rounding
    mova      m2, [filter_h2_shuf]            ; shuffle mask applied to each source row
    mova      m3, [bilinear_filter_vb+r5-16]  ; interleaved byte weights
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                          ; (x + 1) >> 1 completes the rounded >> 3
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh      [r0+r1*0], m0
    movh      [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh      [r0+r1*0], m0
    movhps    [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub       r4d, 2                          ; 32-bit op: the upper half of r4 is
                                              ; undefined for an int argument
    jg .nextrow
    REP_RET
%endmacro
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
850 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
851 INIT_MMX |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
852 FILTER_BILINEAR_SSSE3 4 |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
853 INIT_XMM |
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
854 FILTER_BILINEAR_SSSE3 8 |
11991 | 855 |
; Plain copy of an 8-byte-wide block, two rows per iteration.
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq      mm0, [r2]                       ; source row 0
    movq      mm1, [r2+r3]                    ; source row 1
    movq      [r0], mm0
    movq      [r0+r1], mm1
    lea       r2, [r2+r3*2]                   ; advance both pointers by two rows
    lea       r0, [r0+r1*2]                   ; (lea leaves the flags alone)
    sub       r4d, 2
    jg .nextrow
    REP_RET
867 | |
; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
; iteration.
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq      mm0, [r2+0]                     ; row 0, left half
    movq      mm1, [r2+8]                     ; row 0, right half
    movq      mm2, [r2+r3+0]                  ; row 1, left half
    movq      mm3, [r2+r3+8]                  ; row 1, right half
    movq      [r0+0], mm0
    movq      [r0+8], mm1
    movq      [r0+r1+0], mm2
    movq      [r0+r1+8], mm3
    lea       r2, [r2+r3*2]                   ; advance both pointers by two rows
    lea       r0, [r0+r1*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET
883 | |
; Plain copy of a 16-byte-wide block using SSE moves, two rows per iteration.
; The source is read with unaligned loads; the destination is written with
; movaps, so dst rows must be 16-byte aligned.
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups    xmm0, [r2]                      ; source row 0
    movups    xmm1, [r2+r3]                   ; source row 1
    movaps    [r0], xmm0
    movaps    [r0+r1], xmm1
    lea       r2, [r2+r3*2]                   ; advance both pointers by two rows
    lea       r0, [r0+r1*2]
    sub       r4d, 2
    jg .nextrow
    REP_RET
895 | |
11975 | 896 ;----------------------------------------------------------------------------- |
897 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
898 ;----------------------------------------------------------------------------- | |
899 | |
; ADD_DC <add reg>, <sub reg>, <byte offset>, <move instruction>
; Applies a signed DC to four rows of pixels with unsigned byte saturation:
; every pixel gets the first register added, then the second subtracted.
; Rows are addressed as r0, r0+r2, r1, r1+r2, each plus the byte offset.
; Uses m2-m5 as scratch. All four rows are loaded before any row is stored.
%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    paddusb   m2, %1
    psubusb   m2, %2
    paddusb   m3, %1
    psubusb   m3, %2
    paddusb   m4, %1
    psubusb   m4, %2
    paddusb   m5, %1
    psubusb   m5, %2
    %4        [r0+%3], m2
    %4        [r0+r2+%3], m3
    %4        [r1+%3], m4
    %4        [r1+r2+%3], m5
%endmacro
918 | |
INIT_MMX
; DC-only inverse transform for one 4x4 block (MMX).
;   r0 = dst, r1 = block, r2 = stride
; dc = (block[0] + 4) >> 3 is split into a non-negative part (m0) and the
; non-negative part of its negation (m1); ADD_DC adds the former and
; subtracts the latter with byte saturation. The first dword of the block is
; zeroed once it has been read.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load the coefficient, then clear it in memory
    movd      m0, [r1]
    pxor      m1, m1
    movd      [r1], m1
    lea       r1, [r0+r2*2]                   ; r1 is free now: point it at row 2

    ; calculate DC
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m1, m0                          ; m1 = -dc

    ; clamp each to 0..255 and broadcast the low byte over 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0
    punpcklwd m0, m0
    packuswb  m1, m1
    punpcklbw m1, m1
    punpcklwd m1, m1

    ; add DC
    ADD_DC    m0, m1, 0, movh
    RET
941 | |
INIT_XMM
; DC-only inverse transform for one 4x4 block (SSE4).
;   r0 = dst, r1 = block, r2 = stride
; All 16 pixels are widened to words in two registers, dc = (block[0]+4)>>3
; is added with ordinary word arithmetic, and packuswb clips the result.
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load the coefficient, then clear it in memory
    movd      m0, [r1]
    pxor      m1, m1
    movd      [r1], m1

    ; calculate DC and broadcast it to all 8 words
    paddw     m0, [pw_4]
    psraw     m0, 3
    pshuflw   m0, m0, 0
    punpcklqdq m0, m0

    ; gather the four 4-pixel rows
    lea       r1, [r0+r2*2]                   ; r1 is free now: point it at row 2
    movd      m2, [r0]
    movd      m3, [r0+r2]
    movd      m4, [r1]
    movd      m5, [r1+r2]
    punpckldq m2, m3                          ; rows 0,1
    punpckldq m4, m5                          ; rows 2,3
    punpcklbw m2, m1                          ; byte->word
    punpcklbw m4, m1

    ; add DC, clip, scatter the rows back
    paddw     m2, m0
    paddw     m4, m0
    packuswb  m2, m4
    movd      [r0], m2
    pextrd    [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd    [r1+r2], m2, 3
    RET
971 | |
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (MMX).
;   r0 = dst, r1 = block array (32 bytes per block), r2 = stride
; The four DCs A,B,C,D are each expanded to 4 identical bytes; blocks A/B
; cover dst bytes 0..7 and blocks C/D cover bytes 8..15 of each row.
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0]                   ; A
    movd      m1, [r1+32*2]                   ; C
    punpcklwd m0, [r1+32*1]                   ; A B
    punpcklwd m1, [r1+32*3]                   ; C D
    punpckldq m0, m1                          ; A B C D

    ; clear the DC coefficients in memory
    pxor      m6, m6
    movd      [r1+32*0], m6
    movd      [r1+32*1], m6
    movd      [r1+32*2], m6
    movd      [r1+32*3], m6

    ; calculate DC
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m6, m0                          ; m6 = -dc

    ; non-negative part: clamp and replicate each DC over 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0                          ; AABBCCDD
    movq      m1, m0
    punpcklbw m0, m0                          ; AAAABBBB
    punpckhbw m1, m1                          ; CCCCDDDD

    ; negated part: same expansion
    packuswb  m6, m6
    punpcklbw m6, m6                          ; AABBCCDD
    movq      m7, m6
    punpcklbw m6, m6                          ; AAAABBBB
    punpckhbw m7, m7                          ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
1010 | |
INIT_XMM
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (SSE2).
;   r0 = dst, r1 = block array (32 bytes per block), r2 = stride
; The four DCs are each expanded to 4 identical bytes in one 16-byte
; register, so a single ADD_DC handles all four blocks.
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0]                   ; A
    movd      m1, [r1+32*2]                   ; C
    punpcklwd m0, [r1+32*1]                   ; A B
    punpcklwd m1, [r1+32*3]                   ; C D
    punpckldq m0, m1                          ; A B C D

    ; clear the DC coefficients in memory
    pxor      m1, m1
    movd      [r1+32*0], m1
    movd      [r1+32*1], m1
    movd      [r1+32*2], m1
    movd      [r1+32*3], m1

    ; calculate DC
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m1, m0                          ; m1 = -dc

    ; non-negative part: clamp and replicate each DC over 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0
    punpcklbw m0, m0

    ; negated part: same expansion
    packuswb  m1, m1
    punpcklbw m1, m1
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, mova
    RET
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1040 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------

INIT_MMX
; DC-only inverse transform for four 4x4 blocks laid out 2x2 (MMX).
;   r0 = dst, r1 = block array (32 bytes per block), r2 = stride
; Blocks A/B cover the top four rows of an 8-byte-wide area, blocks C/D the
; four rows below it.
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0]                   ; A
    movd      m1, [r1+32*2]                   ; C
    punpcklwd m0, [r1+32*1]                   ; A B
    punpcklwd m1, [r1+32*3]                   ; C D
    punpckldq m0, m1                          ; A B C D

    ; clear the DC coefficients in memory
    pxor      m6, m6
    movd      [r1+32*0], m6
    movd      [r1+32*1], m6
    movd      [r1+32*2], m6
    movd      [r1+32*3], m6

    ; calculate DC
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m6, m0                          ; m6 = -dc

    ; non-negative part: clamp and replicate each DC over 4 bytes
    packuswb  m0, m0
    punpcklbw m0, m0                          ; AABBCCDD
    movq      m1, m0
    punpcklbw m0, m0                          ; AAAABBBB
    punpckhbw m1, m1                          ; CCCCDDDD

    ; negated part: same expansion
    packuswb  m6, m6
    punpcklbw m6, m6                          ; AABBCCDD
    movq      m7, m6
    punpcklbw m6, m6                          ; AAAABBBB
    punpckhbw m7, m7                          ; CCCCDDDD

    ; add DC: top four rows, then step both row pointers down four rows
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    lea       r0, [r0+r2*4]
    lea       r1, [r1+r2*4]
    ADD_DC    m1, m7, 0, mova
    RET
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1081 |
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1082 ;----------------------------------------------------------------------------- |
12013 | 1083 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
1084 ;----------------------------------------------------------------------------- | |
1085 | |
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
; result: a = mul_35468(a) - mul_20091(b);  b = mul_20091(a) + mul_35468(b)
; (both right-hand sides use the incoming values of a and b)
; mul_20091(x) is built as x + pmulhw(x, 20091); mul_35468(x) is built as
; pmulhw(2*x, 17734). The macro expects m6 = words of 20091 and m7 = words
; of 17734; the two temporaries are clobbered.
%macro VP8_MULTIPLY_SUMSUB 4
    ; tmp1 = mul_20091(a)
    mova      %3, %1
    pmulhw    %3, m6
    paddw     %3, %1
    ; tmp2 = mul_20091(b)
    mova      %4, %2
    pmulhw    %4, m6
    paddw     %4, %2
    ; a = mul_35468(a)
    paddw     %1, %1
    pmulhw    %1, m7
    ; b = mul_35468(b)
    paddw     %2, %2
    pmulhw    %2, m7
    ; combine
    psubw     %1, %4
    paddw     %2, %3
%endmacro
1102 | |
; VP8_IDCT_TRANSFORM4x4_1D r1, r2, r3, r4, t1, t2   (register numbers)
; One 1-D pass of the 4x4 inverse DCT:
;   x0 = %1 + %3                           x1 = %1 - %3
;   x2 = mul_35468(%2) - mul_20091(%4)     x3 = mul_20091(%2) + mul_35468(%4)
;   %1 = x0 + x3 (tmp0)    %2 = x1 + x2 (tmp1)
;   %3 = x1 - x2 (tmp2)    %4 = x0 - x3 (tmp3)
; %5/%6 are temporary registers.
; m6/m7 must hold the constant words 20091/17734.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5         ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6    ;t2, t3
    SUMSUB_BA           m%2, m%1, m%5         ;tmp1, tmp2
    SUMSUB_BA           m%4, m%3, m%5         ;tmp0, tmp3
    ; rename registers so the outputs end up in %1..%4 order
    SWAP                 %4, %1
    SWAP                 %4, %3
%endmacro
1116 | |
INIT_MMX
; VP8_IDCT_ADD <opt>
; Emits vp8_idct_add_<opt>: full 4x4 inverse DCT of one block, added to dst.
;   r0 = dst, r1 = block (16 words), r2 = stride
; The block is zeroed right after it has been loaded. The "sse" variant
; clears it with two 16-byte stores, the other variant with four 8-byte
; stores.
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq      m0, [r1+ 0]
    movq      m1, [r1+ 8]
    movq      m2, [r1+16]
    movq      m3, [r1+24]

    ; clear the coefficients in memory
%ifidn %1, sse
    xorps     xmm0, xmm0
    movaps    [r1+ 0], xmm0
    movaps    [r1+16], xmm0
%else
    pxor      m4, m4
    movq      [r1+ 0], m4
    movq      [r1+ 8], m4
    movq      [r1+16], m4
    movq      [r1+24], m4
%endif

    ; multiplier constants expected by the transform macro
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]

    ; actual IDCT: 1-D pass, transpose, add rounding term, 1-D pass, transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor      m4, m4
    lea       r1, [r0+2*r2]                   ; r1 is free now: point it at row 2
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse
12013 | 1157 |
1158 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1159 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1160 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1161 |
; SCATTER_WHT <reg a>, <reg b>, <index offset>
; Writes the four words of MMX register a and the four words of register b
; to memory at r0 + 2*16*k. Register a goes to k = offset+0, +4, +8, +12 and
; register b to k = offset+1, +5, +9, +13, i.e. one word into the first
; element of each 16-word block. Clobbers r1, r2 and both MMX registers.
%macro SCATTER_WHT 3
    ; words 0 and 1
    movd      r1d, m%1
    movd      r2d, m%2
    psrlq     m%1, 32                         ; bring words 2/3 down for later
    psrlq     m%2, 32
    mov       [r0+2*16*(0+%3)], r1w
    mov       [r0+2*16*(1+%3)], r2w
    shr       r1d, 16
    shr       r2d, 16
    mov       [r0+2*16*(4+%3)], r1w
    mov       [r0+2*16*(5+%3)], r2w
    ; words 2 and 3
    movd      r1d, m%1
    movd      r2d, m%2
    mov       [r0+2*16*(8+%3)], r1w
    mov       [r0+2*16*(9+%3)], r2w
    shr       r1d, 16
    shr       r2d, 16
    mov       [r0+2*16*(12+%3)], r1w
    mov       [r0+2*16*(13+%3)], r2w
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1182 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; HADAMARD4_1D r1, r2, r3, r4   (register numbers)
; One 1-D pass of a 4-point Hadamard transform: two rounds of paired
; sum/difference butterflies, then a register rename.
; NOTE(review): the exact output ordering depends on SUMSUB_BADC from
; x86util.asm, which is not visible here.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3            ; first butterfly round
    SUMSUB_BADC m%4, m%2, m%3, m%1            ; second butterfly round
    SWAP        %1, %4, %3                    ; rename only, emits no instructions
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1188 |
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1189 %macro VP8_DC_WHT 1 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1190 cglobal vp8_luma_dc_wht_%1, 2,3 |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1191 movq m0, [r1] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1192 movq m1, [r1+8] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1193 movq m2, [r1+16] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1194 movq m3, [r1+24] |
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1195 %ifidn %1, sse |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1196 xorps xmm0, xmm0 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1197 movaps [r1+ 0], xmm0 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1198 movaps [r1+16], xmm0 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1199 %else |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1200 pxor m4, m4 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1201 movq [r1+ 0], m4 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1202 movq [r1+ 8], m4 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1203 movq [r1+16], m4 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1204 movq [r1+24], m4 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1205 %endif |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1206 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1207 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1208 paddw m0, [pw_3] |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1209 HADAMARD4_1D 0, 1, 2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1210 psraw m0, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1211 psraw m1, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1212 psraw m2, 3 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1213 psraw m3, 3 |
12209 | 1214 SCATTER_WHT 0, 1, 0 |
1215 SCATTER_WHT 2, 3, 2 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1216 RET |
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1217 %endmacro |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1218 |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1219 INIT_MMX |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1220 VP8_DC_WHT mmx |
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1221 VP8_DC_WHT sse |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1222 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1223 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1224 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1225 ;----------------------------------------------------------------------------- |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1226 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1227 ; macro called with 7 mm register indexes as argument, and 4 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1228 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1229 ; first 4 mm registers will carry the transposed pixel data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1230 ; the other three are scratchspace (one would be sufficient, but this allows |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1231 ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1232 ; |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1233 ; first two regular registers are buf+4*stride and buf+5*stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1234 ; third is -stride, fourth is +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1235 %macro READ_8x4_INTERLEAVED 11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1236 ; interleave 8 (A-H) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1237 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1238 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1239 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1240 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1241 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1242 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1243 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1244 punpcklbw m%1, m%5 ; A/B interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1245 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1246 punpcklbw m%2, m%6 ; C/D interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1247 punpcklbw m%3, m%7 ; E/F interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1248 punpcklbw m%4, m%5 ; G/H interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1249 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1250 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1251 ; macro called with 7 mm register indexes as argument, and 5 regular registers |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1252 ; first 11 mean the same as READ_8x4_INTERLEAVED above |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1253 ; fifth regular register is scratchspace to reach the bottom 8 rows, it |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1254 ; will be set to second regular register + 8*stride at the end |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1255 %macro READ_16x4_INTERLEAVED 12 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1256 ; interleave 16 (A-P) rows of 4 pixels each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1257 lea %12, [r0+8*r2] |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1258 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1259 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1260 movd m%1, [%8+%10*4] ; A0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1261 movd m%3, [%12+%10*4] ; I0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1262 movd m%2, [%8+%10*2] ; C0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1263 movd m%4, [%12+%10*2] ; K0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1264 movd m%6, [%8+%10] ; D0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1265 movd m%5, [%12+%10] ; L0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1266 movd m%7, [%12] ; M0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1267 add %12, %11 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1268 punpcklbw m%1, m%3 ; A/I |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1269 movd m%3, [%8] ; E0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1270 punpcklbw m%2, m%4 ; C/K |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1271 punpcklbw m%6, m%5 ; D/L |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1272 punpcklbw m%3, m%7 ; E/M |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1273 punpcklbw m%2, m%6 ; C/D/K/L interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1274 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1275 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1276 movd m%5, [%9+%10*4] ; B0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1277 movd m%4, [%12+%10*4] ; J0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1278 movd m%7, [%9] ; F0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1279 movd m%6, [%12] ; N0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1280 punpcklbw m%5, m%4 ; B/J |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1281 punpcklbw m%7, m%6 ; F/N |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1282 punpcklbw m%1, m%5 ; A/B/I/J interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1283 punpcklbw m%3, m%7 ; E/F/M/N interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1284 movd m%4, [%9+%11] ; G0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1285 movd m%6, [%12+%11] ; O0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1286 movd m%5, [%9+%11*2] ; H0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1287 movd m%7, [%12+%11*2] ; P0-3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1288 punpcklbw m%4, m%6 ; G/O |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1289 punpcklbw m%5, m%7 ; H/P |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1290 punpcklbw m%4, m%5 ; G/H/O/P interleaved |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1291 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1292 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1293 ; write 4 mm registers of 2 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1294 ; first four arguments are mm register indexes containing source data |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1295 ; last four are registers containing buf+4*stride, buf+5*stride, |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1296 ; -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1297 %macro WRITE_4x2D 8 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1298 ; write out (2 dwords per register) |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1299 movd [%5+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1300 movd [%5+%7*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1301 movd [%5], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1302 movd [%6+%8], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1303 punpckhdq m%1, m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1304 punpckhdq m%2, m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1305 punpckhdq m%3, m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1306 punpckhdq m%4, m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1307 movd [%6+%7*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1308 movd [%5+%7], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1309 movd [%6], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1310 movd [%6+%8*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1311 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1312 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1313 ; write 4 xmm registers of 4 dwords each |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1314 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1315 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1316 ; we add 1*stride to the third regular register in the process |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1317 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1318 ; same memory region), or 8 if they cover two separate buffers (third one points to |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1319 ; a different memory region than the first two), allowing for more optimal code for |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1320 ; the 16-width case |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1321 %macro WRITE_4x4D 10 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1322 ; write out (4 dwords per register), start with dwords zero |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1323 movd [%5+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1324 movd [%5], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1325 movd [%7+%8*4], m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1326 movd [%7], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1327 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1328 ; store dwords 1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1329 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1330 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1331 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1332 psrldq m%4, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1333 movd [%6+%8*4], m%1 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1334 movd [%6], m%2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1335 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1336 movd [%6+%9*4], m%3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1337 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1338 movd [%7+%9], m%4 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1339 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1340 ; write dwords 2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1341 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1342 psrldq m%2, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1343 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1344 movd [%5+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1345 movd %5, m%3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1346 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1347 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1348 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1349 %if %10 == 16 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1350 movd [%5+%8*2], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1351 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1352 movd [%6+%9], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1353 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1354 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1355 add %7, %9 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1356 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1357 ; store dwords 3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1358 psrldq m%1, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1359 psrldq m%2, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1360 psrldq m%3, 4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1361 psrldq m%4, 4 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1362 %if %10 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1363 mov [%7+%8*4], %5d |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1364 movd [%6+%8*2], m%1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1365 %else |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1366 movd [%5+%8], m%1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1367 %endif |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1368 movd [%6+%9*2], m%2 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1369 movd [%7+%8*2], m%3 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1370 movd [%7+%9*2], m%4 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1371 %endmacro |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1372 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1374 ; 1 and 2 are the registers to write, they can be the same (for SSE2) |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1375 ; for pre-SSE4: |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1376 ; 3 is a general-purpose register that we will clobber |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1377 ; for SSE4: |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1378 ; 3 is a pointer to the destination's 5th line |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1379 ; 4 is a pointer to the destination's 4th line |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1380 ; 5/6 is -stride and +stride |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_2x4W: store the 4 words held in each of two mmx registers, one word
; per line, onto 8 lines addressed relative to the destination pointer.
; 1/2 are the mmx registers to write; both are clobbered (punpckhdq moves
;     their high dword down so a second movd can fetch it)
; 3 is a general-purpose temp (clobbered); only its low word is stored
; 4 is the destination pointer; it is stepped by 5/6 while writing and ends
;     up back at its entry value as long as 5 == -6
; 5/6 is -stride and +stride
; With 5 == -6 the words of 1 land on lines -4..-1 and the words of 2 on
; lines 0..+3 relative to 4.
1381 %macro WRITE_2x4W 6 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 0/1 of reg 1 -> lines -4 and -3
1382 movd %3, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1383 punpckhdq %1, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1384 mov [%4+%5*4], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1385 shr %3, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; a scale of 3 is not encodable, so move the base one line down instead
1386 add %4, %6 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1387 mov [%4+%5*4], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1388 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 2/3 of reg 1 -> lines -2 and -1 (base pointer moved back first)
1389 movd %3, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1390 add %4, %5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1391 mov [%4+%5*2], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1392 shr %3, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1393 mov [%4+%5 ], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1394 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 0/1 of reg 2 -> lines 0 and +1
1395 movd %3, %2 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1396 punpckhdq %2, %2 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1397 mov [%4 ], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1398 shr %3, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1399 mov [%4+%6 ], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1400 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 2/3 of reg 2 -> lines +2 and +3
1401 movd %3, %2 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1402 add %4, %6 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1403 mov [%4+%6 ], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1404 shr %3, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1405 mov [%4+%6*2], %3w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; undo the last step so the destination pointer leaves as it came in
1406 add %4, %5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1407 %endmacro |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1408 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_8W_SSE2: store the 8 words of one xmm register, one word per line,
; going through a general-purpose register two words at a time.
; 1 is the xmm register to write; it is clobbered (shifted down by psrldq)
; 2 is a general-purpose temp (clobbered); only its low word is stored
; 3 is the destination pointer; it is modified, and on exit it has been
;     advanced by 5 + 4 + 5 in total (one +stride when 4 == -5)
; 4/5 is -stride and +stride
; With 4 == -5 the words land on lines -4..+3 relative to the entry value of 3.
1409 %macro WRITE_8W_SSE2 5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 0/1 -> lines -4 and -3
1410 movd %2, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1411 psrldq %1, 4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1412 mov [%3+%4*4], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1413 shr %2, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; a scale of 3 is not encodable, so move the base one line down instead
1414 add %3, %5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1415 mov [%3+%4*4], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1416 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 2/3 -> lines -2 and -1 (base pointer moved back first)
1417 movd %2, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1418 psrldq %1, 4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1419 add %3, %4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1420 mov [%3+%4*2], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1421 shr %2, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1422 mov [%3+%4 ], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1423 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 4/5 -> lines 0 and +1
1424 movd %2, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1425 psrldq %1, 4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1426 mov [%3 ], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1427 shr %2, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1428 mov [%3+%5 ], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1429 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; words 6/7 -> lines +2 and +3; this last pointer step is NOT undone
1430 movd %2, %1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1431 add %3, %5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1432 mov [%3+%5 ], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1433 shr %2, 16 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1434 mov [%3+%5*2], %2w |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1435 %endmacro |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1436 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_8W_SSE4: store the 8 words of one xmm register, one word per line,
; using pextrw straight to memory (the memory-destination form of pextrw is
; an SSE4.1 instruction). No register is modified.
; 1 is the xmm register to write (left intact)
; 2/3 are two destination pointers, 4/5 two strides
; The eight addresses only form consecutive lines -4..+3 around 3 when 2
; points exactly one line (5) past 3 and 4 == -5; a second base pointer is
; needed because an index scale of 3 cannot be encoded.
1437 %macro WRITE_8W_SSE4 5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1438 pextrw [%3+%4*4], %1, 0 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1439 pextrw [%2+%4*4], %1, 1 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1440 pextrw [%3+%4*2], %1, 2 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1441 pextrw [%3+%4 ], %1, 3 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1442 pextrw [%3 ], %1, 4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1443 pextrw [%2 ], %1, 5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1444 pextrw [%2+%5 ], %1, 6 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1445 pextrw [%2+%5*2], %1, 7 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1446 %endmacro |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1447 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMX: broadcast the low byte of a general-purpose register to
; all 8 bytes of an mmx register, using plain MMX unpacks only.
; 1 is the destination mmx register
; 2 is the source general-purpose register (only its low byte matters)
; 3 is optional and ignored; accepted so every SPLATB_REG variant can be
;     invoked with the same argument list
; Each unpack of the register with itself doubles the run of identical
; bytes: 1 -> 2 -> 4 -> 8.
1448 %macro SPLATB_REG_MMX 2-3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1449 movd %1, %2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1450 punpcklbw %1, %1 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1451 punpcklwd %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1452 punpckldq %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1453 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1454 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMXEXT: broadcast the low byte of a general-purpose register to
; all 8 bytes of an mmx register.
; 1 is the destination mmx register
; 2 is the source general-purpose register (only its low byte matters)
; 3 is optional and ignored
; punpcklbw duplicates the byte into the low word, then pshufw with
; selector 0 copies that word into all four word lanes.
1455 %macro SPLATB_REG_MMXEXT 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1456 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1457 punpcklbw %1, %1 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1458 pshufw %1, %1, 0x0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1459 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1460 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSE2: broadcast the low byte of a general-purpose register to
; all 16 bytes of an xmm register.
; 1 is the destination xmm register
; 2 is the source general-purpose register (only its low byte matters)
; 3 is optional and ignored
; punpcklbw duplicates the byte into the low word, pshuflw with selector 0
; fills the low quadword with that word, and punpcklqdq copies the low
; quadword into the high one.
1461 %macro SPLATB_REG_SSE2 2-3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1462 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1463 punpcklbw %1, %1 |
12210 | 1464 pshuflw %1, %1, 0x0 |
1465 punpcklqdq %1, %1 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1466 %endmacro |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1467 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSSE3: broadcast the low byte of a general-purpose register to
; every byte of a SIMD register with a single pshufb.
; 1 is the destination SIMD register
; 2 is the source general-purpose register (only its low byte matters)
; 3 is the pshufb control register and is required here; it must be all
;     zeroes on entry so that every output byte selects source byte 0
;     (this macro does not zero it itself)
1468 %macro SPLATB_REG_SSSE3 3 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1469 movd %1, %2 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1470 pshufb %1, %3 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1471 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1472 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
;-----------------------------------------------------------------------------
; SIMPLE_LOOPFILTER: emit one VP8 "simple" loop-filter function named
; vp8_<dir>_loop_filter_simple_<opt>.
;
; 1 is the instruction-set suffix; it goes into the function name and is
;   compared against "sse2" and "sse4" to pick code paths
; 2 is the filter direction, "v" or "h"
; 3 is the number of general-purpose registers handed to cglobal
;
; The function takes 3 arguments:
;   r0 = pointer to the first q0 pixel (the pixel just past the edge)
;   r1 = stride (negated below; r2 keeps the positive copy)
;   r2 = "flim", the filter limit (only its low byte is used)
;   NOTE(review): presumably (uint8_t *dst, int stride, int flim) on the C
;   side - confirm against the C prototype.
;
; Naming used in the comments: p1 p0 | q0 q1 are the two pixels on either
; side of the edge. Only p0 and q0 are modified.
;
; With mmsize == 8 the body runs twice (r3 counts 2 -> 0), each pass
; handling 8 pixels along the edge; with mmsize == 16 a single pass handles
; 16 and r3 is free for other use.
;
; The "v" variant loads and stores whole rows with mova, so the rows must be
; suitably aligned for the register size.
; The "h" variant needs r4 (and r3 when mmsize == 16) as extra pointers, so
; 3 has to be large enough to cover them.
;
; pb_80, pb_FE, pb_F8, pb_4, pb_3, SPLATB_REG, WRITE_8W, WRITE_2x4W, the
; READ_*_INTERLEAVED macros, TRANSPOSE4x4W and SBUTTERFLY are defined outside
; this macro.
;-----------------------------------------------------------------------------
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2            ; two passes of 8 pixels each
%endif
%ifnidn %1, sse2
%if mmsize == 16
    ; xmm but not plain sse2: SPLATB_REG is the pshufb variant here and needs
    ; an all-zero shuffle control register
    pxor           m0, m0
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1           ; r2 = +stride
    neg            r1               ; r1 = -stride
%ifidn %2, h
    ; go 4 lines down and 2 pixels left, so the negative-offset addressing
    ; below reaches lines 0..7 and each line starts at p1
    lea            r0, [r0+4*r2-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]      ; r4 = one line below r0

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0           ; flipping the top bit turns the unsigned
    pxor           m4, m0           ; pixels into signed bytes
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    ; psrlq shifts whole quadwords, so the bit that would slide into the
    ; neighbouring byte is masked off first
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7           ; saturates to 0 where the sum is <= flim
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0           ; m0 still holds pb_80 here
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3           ; drop the low 3 bits so the quadword
    pand           m1, m3           ; cache f2<<3 (shifts below cannot leak
                                    ; bits between bytes)

    ; There is no per-byte arithmetic shift, so the signed >>3 is built from
    ; two logical shifts: one of the value (used where it is >= 0) and one of
    ; its negation (used where it is < 0), selected by the sign mask in m0.
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    inc            r0               ; skip p1: only the p0/q0 pair is written
    SBUTTERFLY     bw, 6, 4, 0      ; interleave p0/q0 bytes into words

%if mmsize == 16 ; sse2
%ifidn %1, sse4
    inc            r4               ; sse4 uses r4 as a second pointer
%endif
    WRITE_8W       m6, r4, r0, r1, r2
    lea            r4, [r3+r1+1]
%ifidn %1, sse4
    inc            r3
%endif
    WRITE_8W       m4, r3, r4, r1, r2
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, r4, r0, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    ; the -1 takes back the "inc r0" done before the store
    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1612 |
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; Instantiate the SIMPLE_LOOPFILTER macro once per instruction-set flavour
; and filter direction. Arguments: name suffix of the flavour (mmx, mmxext,
; sse2, ssse3, sse4), direction (v or h, tested with %ifidn inside the macro),
; and a numeric third argument.
; NOTE(review): the third argument is presumably forwarded to cglobal, the way
; INNER_LOOPFILTER forwards its %3 - confirm against the SIMPLE_LOOPFILTER
; macro header.
; SPLATB_REG (and, for the XMM flavours, WRITE_8W) are re-pointed with %define
; before each group, so the order of the lines below matters: every
; instantiation expands with whichever alias was defined last.
; Both directions are generated for mmx, mmxext, sse2 and ssse3; sse4 is
; generated for the h direction only, after switching WRITE_8W to
; WRITE_8W_SSE4 (SPLATB_REG stays at the SSSE3 variant for it).
1613 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1614 %define SPLATB_REG SPLATB_REG_MMX |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1615 SIMPLE_LOOPFILTER mmx, v, 4 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1616 SIMPLE_LOOPFILTER mmx, h, 5 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1617 %define SPLATB_REG SPLATB_REG_MMXEXT |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1618 SIMPLE_LOOPFILTER mmxext, v, 4 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1619 SIMPLE_LOOPFILTER mmxext, h, 5 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1620 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1621 %define SPLATB_REG SPLATB_REG_SSE2 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1622 %define WRITE_8W WRITE_8W_SSE2 |
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1623 SIMPLE_LOOPFILTER sse2, v, 3 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1624 SIMPLE_LOOPFILTER sse2, h, 5 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1625 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 1626 SIMPLE_LOOPFILTER ssse3, v, 3 |
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1627 SIMPLE_LOOPFILTER ssse3, h, 5 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1628 %define WRITE_8W WRITE_8W_SSE4 |
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1629 SIMPLE_LOOPFILTER sse4, h, 5 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1630 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1631 ;----------------------------------------------------------------------------- |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1633 ; int flimE, int flimI, int hev_thr); <size> is 16y (luma) or 8uv (chroma; only 8uv takes the second pointer) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1634 ;----------------------------------------------------------------------------- |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1635 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1636 %macro INNER_LOOPFILTER 5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1637 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1639 %define dst8_reg r1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1640 %define mstride_reg r2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1641 %define E_reg r3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1642 %define I_reg r4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1643 %define hev_thr_reg r5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1644 %else ; luma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1645 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1646 %define mstride_reg r1 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1647 %define E_reg r2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1648 %define I_reg r3 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1649 %define hev_thr_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1650 %ifdef m8 ; x86-64, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1651 %define dst8_reg r4 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1652 %elif mmsize == 16 ; x86-32, sse2 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1653 %define dst8_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1654 %else ; x86-32, mmx/mmxext |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1655 %define cnt_reg r5 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1656 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1657 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1658 %define dst_reg r0 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1659 %define stride_reg E_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1660 %define dst2_reg I_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1661 %ifndef m8 |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1662 %define stack_reg hev_thr_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1663 %endif |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1664 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1665 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1666 %if mmsize == 16 |
12210 | 1667 pxor m7, m7 |
1668 %endif | |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1669 %endif |
12210 | 1670 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1671 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1672 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1673 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1674 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1675 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1676 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1677 ; align stack |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1678 mov stack_reg, rsp ; backup stack pointer |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1679 and rsp, ~(mmsize-1) ; align stack |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1680 %ifidn %2, v |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1681 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1682 ; [3]=hev() result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1683 %else ; h |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1684 sub rsp, mmsize * 5 ; extra storage space for transposes |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1685 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1687 %define flim_E [rsp] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1688 %define flim_I [rsp+mmsize] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1689 %define hev_thr [rsp+mmsize*2] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1690 %define mask_res [rsp+mmsize*3] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1691 %define p0backup [rsp+mmsize*3] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1692 %define q0backup [rsp+mmsize*4] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1693 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1694 mova flim_E, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1695 mova flim_I, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 mova hev_thr, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1697 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1698 %else ; sse2 on x86-64 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1699 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1700 %define flim_E m9 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1701 %define flim_I m10 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1702 %define hev_thr m11 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1703 %define mask_res m12 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1704 %define p0backup m12 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1705 %define q0backup m8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1706 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1707 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1708 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1709 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1710 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1711 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1712 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1713 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1714 mov cnt_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1715 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1716 mov stride_reg, mstride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1717 neg mstride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1718 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1719 lea dst_reg, [dst_reg + stride_reg*4-4] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1720 %if %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1721 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1722 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1723 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1724 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1725 %if mmsize == 8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1727 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1728 ; read |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1729 lea dst2_reg, [dst_reg + stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1730 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1731 %if %4 == 8 && mmsize == 16 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1732 %define movrow movh |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1733 %else |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1734 %define movrow mova |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1735 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1736 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1737 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1738 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1739 movrow m5, [dst2_reg] ; q1 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1740 movrow m6, [dst2_reg+ stride_reg] ; q2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1741 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1742 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1743 movhps m0, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1744 movhps m2, [dst8_reg+mstride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1745 add dst8_reg, stride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1746 movhps m1, [dst8_reg+mstride_reg*4] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1747 movhps m5, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1748 movhps m6, [dst8_reg+ stride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1749 movhps m7, [dst8_reg+ stride_reg*2] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1750 add dst8_reg, mstride_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1751 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 %elif mmsize == 8 ; mmx/mmxext (h) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1753 ; read 8 rows of 8px each |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1754 movu m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1755 movu m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1756 movu m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1757 movu m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1758 movu m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1759 movu m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1760 movu m6, [dst2_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1761 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1762 ; 8x8 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1764 mova q0backup, m1 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1765 movu m7, [dst2_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1770 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1771 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1773 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 %else ; sse2 (h) |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1779 %if %4 == 16 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1780 lea dst8_reg, [dst_reg + stride_reg*8] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1781 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 ; read 16 rows of 8px each, interleave |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1784 movh m0, [dst_reg +mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1785 movh m1, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1786 movh m2, [dst_reg +mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1787 movh m5, [dst8_reg+mstride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1788 movh m3, [dst_reg +mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1789 movh m6, [dst8_reg+mstride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1790 movh m4, [dst_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1791 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1792 punpcklbw m0, m1 ; A/I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1793 punpcklbw m2, m5 ; C/K |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1794 punpcklbw m3, m6 ; D/L |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 punpcklbw m4, m7 ; E/M |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1796 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1797 add dst8_reg, stride_reg |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1798 movh m1, [dst2_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1799 movh m6, [dst8_reg+mstride_reg*4] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1800 movh m5, [dst2_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1801 movh m7, [dst8_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 punpcklbw m1, m6 ; B/J |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1803 punpcklbw m5, m7 ; F/N |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1804 movh m6, [dst2_reg+ stride_reg] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1805 movh m7, [dst8_reg+ stride_reg] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1806 punpcklbw m6, m7 ; G/O |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1807 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1808 ; 8x16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1809 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1810 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1811 SWAP 1, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1812 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1813 mova q0backup, m1 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1814 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1815 movh m7, [dst2_reg+ stride_reg*2] |
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1816 movh m1, [dst8_reg+ stride_reg*2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1817 punpcklbw m7, m1 ; H/P |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1818 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1820 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1822 %ifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1823 SWAP 1, 8 |
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1824 SWAP 2, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1826 mova m1, q0backup |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1827 mova q0backup, m2 ; store q0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1830 %ifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1831 SWAP 5, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1833 mova p0backup, m5 ; store p0 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1835 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1836 SWAP 2, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1837 SWAP 6, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1838 SWAP 5, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1839 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1840 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1841 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1842 mova m4, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1843 SWAP 4, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1844 psubusb m4, m0 ; p2-p3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1845 psubusb m0, m1 ; p3-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1846 por m0, m4 ; abs(p3-p2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1847 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1848 mova m4, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1849 SWAP 4, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1850 psubusb m4, m1 ; p1-p2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 psubusb m1, m2 ; p2-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1852 por m1, m4 ; abs(p2-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 mova m4, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 SWAP 4, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 psubusb m4, m7 ; q2-q3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 psubusb m7, m6 ; q3-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1858 por m7, m4 ; abs(q3-q2) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 mova m4, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 SWAP 4, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 psubusb m4, m6 ; q1-q2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 psubusb m6, m5 ; q2-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1864 por m6, m4 ; abs(q2-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 %ifidn %1, mmx |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1867 mova m4, flim_I |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1868 pxor m3, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 psubusb m0, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 psubusb m7, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 psubusb m6, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1873 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 pand m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1879 pand m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 pmaxub m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1883 pmaxub m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 SWAP 7, 3 ; now m7 is zero |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1889 movrow m3, [dst_reg +mstride_reg] ; p0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1890 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1891 movhps m3, [dst8_reg+mstride_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1892 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1893 %elifdef m12 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1894 SWAP 3, 12 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1896 mova m3, p0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 mova m1, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 SWAP 1, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 mova m6, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 SWAP 3, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 psubusb m1, m3 ; p1-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 psubusb m6, m2 ; p0-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 por m1, m6 ; abs(p1-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 mova m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 psubusb m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1911 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 pand m0, m1 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1913 mova mask_res, m6 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 pmaxub m0, m1 ; max_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 SWAP 1, 4 ; max_hev_thresh |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 SWAP 6, 4 ; now m6 is I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1921 movrow m4, [dst_reg] ; q0 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1922 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1923 movhps m4, [dst8_reg] |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1924 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1925 %elifdef m8 |
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1926 SWAP 4, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1928 mova m4, q0backup |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 SWAP 1, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 SWAP 7, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 psubusb m1, m5 ; q0-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 psubusb m7, m4 ; q1-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 por m1, m7 ; abs(q1-q0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 mova m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1939 psubusb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1940 psubusb m7, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1941 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1942 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1943 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1944 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1945 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1946 pand m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1947 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1948 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 pmaxub m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1950 pmaxub m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1951 psubusb m0, flim_I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 psubusb m6, hev_thr |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 pcmpeqb m0, m7 ; max(abs(..)) <= I |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1957 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1959 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1960 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 ; simple_limit |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1963 mova m1, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 SWAP 1, 3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1965 mova m6, m4 ; keep copies of p0/q0 around for later use |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1966 SWAP 6, 4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1967 psubusb m1, m4 ; p0-q0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1968 psubusb m6, m3 ; q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1969 por m1, m6 ; abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1970 paddusb m1, m1 ; m1=2*abs(q0-p0) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1971 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1972 mova m7, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 SWAP 7, 2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1974 mova m6, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 SWAP 6, 5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1976 psubusb m7, m5 ; p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 psubusb m6, m2 ; q1-p1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1978 por m7, m6 ; abs(q1-p1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 pxor m6, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 pand m7, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1981 psrlq m7, 1 ; abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 psubusb m7, flim_E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1984 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1985 pand m0, m7 ; normal_limit result |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1986 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1987 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1988 %ifdef m8 ; x86-64 && sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1989 mova m8, [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1990 %define pb_80_var m8 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1991 %else ; x86-32 or mmx/mmxext |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1992 %define pb_80_var [pb_80] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1993 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1994 mova m1, m4 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1995 mova m7, m3 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1996 pxor m1, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1997 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1998 psubsb m1, m7 ; (signed) q0-p0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1999 mova m6, m2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2000 mova m7, m5 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2001 pxor m6, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2002 pxor m7, pb_80_var |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2003 psubsb m6, m7 ; (signed) p1-q1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2004 mova m7, mask_res |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2005 pandn m7, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2006 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2007 paddsb m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2008 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2009 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2010 pand m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2011 mova m1, [pb_F8] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2012 mova m6, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2013 paddsb m7, [pb_3] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2014 paddsb m6, [pb_4] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2015 pand m7, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2016 pand m6, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2017 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2018 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2019 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2020 pcmpgtb m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2021 psubb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2022 psrlq m7, 3 ; +f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2023 psrlq m0, 3 ; -f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2024 pand m0, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2025 pandn m1, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2026 psubusb m3, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2027 paddusb m3, m1 ; p0+f2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2028 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2029 pxor m1, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2030 pxor m0, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2031 pcmpgtb m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2032 psubb m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2033 psrlq m6, 3 ; +f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2034 psrlq m1, 3 ; -f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2035 pand m1, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2036 pandn m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2037 psubusb m4, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2038 paddusb m4, m1 ; q0-f1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2039 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2040 %ifdef m12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2041 SWAP 6, 12 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2042 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2043 mova m6, mask_res |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2044 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2045 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2046 mova m7, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2047 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2048 pxor m7, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2049 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2050 pand m0, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2051 pand m1, m6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2052 %ifidn %1, mmx |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2053 paddusb m0, m7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2054 pand m1, [pb_FE] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2055 pandn m7, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2056 psrlq m1, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2057 psrlq m7, 1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2058 SWAP 0, 7 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2059 %else ; mmxext/sse2 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2060 psubusb m1, [pb_1] |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2061 pavgb m0, m7 ; a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2062 pavgb m1, m7 ; -a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2063 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2064 psubusb m5, m0 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2065 psubusb m2, m1 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2066 paddusb m5, m1 ; q1-a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2067 paddusb m2, m0 ; p1+a |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2068 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2069 ; store |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2070 %ifidn %2, v |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2071 movrow [dst_reg +mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2072 movrow [dst_reg +mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2073 movrow [dst_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2074 movrow [dst_reg + stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2075 %if mmsize == 16 && %4 == 8 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2076 movhps [dst8_reg+mstride_reg*2], m2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2077 movhps [dst8_reg+mstride_reg ], m3 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2078 movhps [dst8_reg], m4 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2079 movhps [dst8_reg+ stride_reg ], m5 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2080 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2081 %else ; h |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2082 add dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2083 add dst2_reg, 2 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2084 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2085 ; 4x8/16 transpose |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2086 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2087 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2088 %if mmsize == 8 ; mmx/mmxext (h) |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2089 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2090 %else ; sse2 (h) |
12180 | 2091 lea dst8_reg, [dst8_reg+mstride_reg+2] |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2092 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2093 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2094 %endif |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2095 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2096 %if mmsize == 8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2097 %if %4 == 8 ; chroma |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2098 %ifidn %2, h |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2099 sub dst_reg, 2 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2100 %endif |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2101 cmp dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2102 mov dst_reg, dst8_reg |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2103 jnz .next8px |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2104 %else |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2105 %ifidn %2, h |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2106 lea dst_reg, [dst_reg + stride_reg*8-2] |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2107 %else ; v |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2108 add dst_reg, 8 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2109 %endif |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2110 dec cnt_reg |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2111 jg .next8px |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2112 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2113 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2114 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2115 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2116 mov rsp, stack_reg ; restore stack pointer |
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
2117 %endif |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2118 RET |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2119 %endmacro |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2120 |
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; Plain-MMX instantiations of the inner loop filter.
; INIT_MMX switches the register abstraction to MMX before the macro is expanded,
; and SPLATB_REG is bound to the MMX byte-splat helper for those expansions.
; Arguments as used by the macro body: %1 = instruction-set name (tested with
; %ifidn %1, mmx), %2 = direction (v or h), %4 = width (8 is treated as chroma,
; otherwise luma).
; NOTE(review): %3 and %5 (6 and 0 here) are presumably the register counts
; handed to cglobal; the macro header is not visible from here - confirm.
2121 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2122 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2123 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
2124 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2125 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2126 INNER_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2127 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; MMXEXT instantiations of the inner loop filter. No INIT_* is issued here, so
; these expand under the INIT_MMX selected earlier in the file; only the
; byte-splat helper is rebound. Same four variants as for plain MMX:
; v/h direction, width 16 (luma) and width 8 (chroma).
2128 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2129 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
2130 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
2131 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
2132 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2133 |
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; SSE2 instantiations of the inner loop filter.
; INIT_XMM switches the register abstraction to XMM, and SPLATB_REG is bound to
; the SSE2 byte-splat helper for the expansions that follow.
; The horizontal luma variant is instantiated with a different third argument
; depending on whether m8 is defined (5 if defined, 6 otherwise). Elsewhere this
; file annotates "%ifdef m8" as the x86-64 case and "%ifndef m8" as x86-32.
; NOTE(review): the third and fifth arguments are presumably the general-purpose
; and XMM register counts passed to cglobal, with the extra x86-32 register
; covering state that x86-64 keeps in m8+ - confirm against the macro header.
2134 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2135 %define SPLATB_REG SPLATB_REG_SSE2 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2136 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2137 %ifdef m8 |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2138 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2139 %else |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2140 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2141 %endif |
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2142 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2143 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2144 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SSSE3 instantiations of the inner loop filter. No INIT_* is issued here, so
; these expand under the INIT_XMM selected earlier in the file; only the
; byte-splat helper is rebound. The argument lists mirror the SSE2 set,
; including the m8-dependent third argument for the horizontal luma variant
; (5 when m8 is defined, 6 otherwise).
2145 %define SPLATB_REG SPLATB_REG_SSSE3 |
12210 | 2146 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
2147 %ifdef m8 | |
2148 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
2149 %else | |
2150 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
2151 %endif | |
2152 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
2153 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
2154 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2155 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2156 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 ; int flimE, int flimI, int hev_thr); |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 ;----------------------------------------------------------------------------- |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 %macro MBEDGE_LOOPFILTER 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2162 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2163 %define dst8_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2164 %define mstride_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %define E_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 %define I_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %define hev_thr_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 %else ; luma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2170 %define mstride_reg r1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2171 %define E_reg r2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2172 %define I_reg r3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2173 %define hev_thr_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %ifdef m8 ; x86-64, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 %define dst8_reg r4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2176 %elif mmsize == 16 ; x86-32, sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2177 %define dst8_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2178 %else ; x86-32, mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2179 %define cnt_reg r5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2180 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2181 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2182 %define dst_reg r0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2183 %define stride_reg E_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 %define dst2_reg I_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 %ifndef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2186 %define stack_reg hev_thr_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2187 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2188 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2189 %define ssse3_or_higher 0 |
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2190 %ifnidn %1, sse2 |
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2191 %if mmsize == 16 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2192 %define ssse3_or_higher 1 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2193 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2194 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2195 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2196 %if ssse3_or_higher |
12210 | 2197 pxor m7, m7 |
2198 %endif | |
2199 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2201 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2202 SPLATB_REG m0, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2203 SPLATB_REG m1, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2204 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2205 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 ; align stack |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 mov stack_reg, rsp ; backup stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 and rsp, ~(mmsize-1) ; align stack |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2209 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2210 sub rsp, mmsize * 7 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2211 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2212 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2213 ; [3]=hev() result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2214 ; [4]=filter tmp result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 ; [5]/[6] = p2/q2 backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2216 ; [7]=lim_res sign result |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2217 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2218 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2219 %define flim_E [rsp] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2220 %define flim_I [rsp+mmsize] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 %define hev_thr [rsp+mmsize*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2222 %define mask_res [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2223 %define lim_res [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2224 %define p0backup [rsp+mmsize*3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 %define q0backup [rsp+mmsize*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2226 %define p2backup [rsp+mmsize*5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 %define q2backup [rsp+mmsize*6] |
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2228 %if mmsize == 16 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2229 %define lim_sign [rsp] |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2230 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 %define lim_sign [rsp+mmsize*7] |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2232 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2233 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 mova flim_E, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 mova flim_I, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2236 mova hev_thr, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2237 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2238 %else ; sse2 on x86-64 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 %define flim_E m9 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 %define flim_I m10 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 %define hev_thr m11 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2243 %define mask_res m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 %define lim_res m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 %define p0backup m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2246 %define q0backup m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2247 %define p2backup m13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2248 %define q2backup m14 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2249 %define lim_sign m9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 ; splat function arguments |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2252 SPLATB_REG flim_E, E_reg, m7 ; E |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2253 SPLATB_REG flim_I, I_reg, m7 ; I |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2254 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 mov cnt_reg, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 mov stride_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 neg mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 lea dst_reg, [dst_reg + stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 %if %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 ; read |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 lea dst2_reg, [dst_reg + stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 %if %4 == 8 && mmsize == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 %define movrow movh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 %define movrow mova |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 movrow m5, [dst2_reg] ; q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 movrow m6, [dst2_reg+ stride_reg] ; q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 movhps m0, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 movhps m2, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 movhps m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 movhps m5, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 movhps m6, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 movhps m7, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 %elif mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 ; read 8 rows of 8px each |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 movu m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 movu m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 movu m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 movu m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 movu m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 movu m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 movu m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 ; 8x8 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 movu m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 %if %4 == 16 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 lea dst8_reg, [dst_reg + stride_reg*8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 ; read 16 rows of 8px each, interleave |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 movh m0, [dst_reg +mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 movh m1, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 movh m2, [dst_reg +mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 movh m5, [dst8_reg+mstride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 movh m3, [dst_reg +mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 movh m6, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 movh m4, [dst_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 punpcklbw m0, m1 ; A/I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 punpcklbw m2, m5 ; C/K |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 punpcklbw m3, m6 ; D/L |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 punpcklbw m4, m7 ; E/M |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 movh m1, [dst2_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 movh m6, [dst8_reg+mstride_reg*4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 movh m5, [dst2_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 movh m7, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 punpcklbw m1, m6 ; B/J |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 punpcklbw m5, m7 ; F/N |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 movh m6, [dst2_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 movh m7, [dst8_reg+ stride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 punpcklbw m6, m7 ; G/O |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 ; 8x16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 mova q0backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 movh m7, [dst2_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 movh m1, [dst8_reg+ stride_reg*2] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 punpcklbw m7, m1 ; H/P |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 SWAP 2, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 mova m1, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 mova q0backup, m2 ; store q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 SWAP 5, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 mova p0backup, m5 ; store p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 SWAP 2, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 SWAP 6, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 SWAP 5, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 mova m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 SWAP 4, 1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 psubusb m4, m0 ; p2-p3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 psubusb m0, m1 ; p3-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 por m0, m4 ; abs(p3-p2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 mova m4, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 SWAP 4, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 psubusb m4, m1 ; p1-p2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 mova p2backup, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 psubusb m1, m2 ; p2-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 por m1, m4 ; abs(p2-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 mova m4, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 SWAP 4, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 psubusb m4, m7 ; q2-q3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 psubusb m7, m6 ; q3-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 por m7, m4 ; abs(q3-q2) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 mova m4, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 SWAP 4, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 psubusb m4, m6 ; q1-q2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 mova q2backup, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 psubusb m6, m5 ; q2-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 por m6, m4 ; abs(q2-q1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 mova m4, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 pxor m3, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 psubusb m0, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 psubusb m7, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 psubusb m6, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 pand m7, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 pmaxub m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 pmaxub m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 SWAP 7, 3 ; m7 is now zero in the mmx build only (m3 was cleared by pxor above); mmxext/sse2 zero m7 later |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 movrow m3, [dst_reg +mstride_reg] ; p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 movhps m3, [dst8_reg+mstride_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 %elifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 SWAP 3, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 mova m3, p0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 mova m1, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 SWAP 1, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 mova m6, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 SWAP 3, 6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 psubusb m1, m3 ; p1-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 psubusb m6, m2 ; p0-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 por m1, m6 ; abs(p1-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 psubusb m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 mova mask_res, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 pmaxub m0, m1 ; max_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 SWAP 1, 4 ; max_hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 SWAP 6, 4 ; now m6 is I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 movrow m4, [dst_reg] ; q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 movhps m4, [dst8_reg] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 %elifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 SWAP 4, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 mova m4, q0backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 SWAP 1, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 SWAP 7, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 psubusb m1, m5 ; q0-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 psubusb m7, m4 ; q1-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 por m1, m7 ; abs(q1-q0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 %ifidn %1, mmx |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 mova m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 psubusb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 psubusb m7, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 pand m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 %else ; mmxext/sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 pxor m7, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 pmaxub m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 pmaxub m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 psubusb m0, flim_I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 psubusb m6, hev_thr |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 pcmpeqb m0, m7 ; max(abs(..)) <= I |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 %ifdef m12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 ; simple_limit |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 mova m1, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 SWAP 1, 3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 mova m6, m4 ; keep copies of p0/q0 around for later use |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 SWAP 6, 4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 psubusb m1, m4 ; p0-q0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 psubusb m6, m3 ; q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 por m1, m6 ; abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 paddusb m1, m1 ; m1=2*abs(q0-p0) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 mova m7, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 SWAP 7, 2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 mova m6, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 SWAP 6, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 psubusb m7, m5 ; p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 psubusb m6, m2 ; q1-p1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 por m7, m6 ; abs(q1-p1) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 pxor m6, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 pand m7, [pb_FE] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 psrlq m7, 1 ; abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 psubusb m7, flim_E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 pand m0, m7 ; normal_limit result |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 %ifdef m8 ; x86-64 && sse2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 mova m8, [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 %define pb_80_var m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 %else ; x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 %define pb_80_var [pb_80] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 mova m1, m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 mova m7, m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 pxor m1, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 psubsb m1, m7 ; (signed) q0-p0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 mova m6, m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 mova m7, m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 pxor m6, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 pxor m7, pb_80_var |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 psubsb m6, m7 ; (signed) p1-q1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 mova m7, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 paddsb m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 pand m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 pand lim_res, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 mova m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 pand m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 mova lim_res, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 mova m1, [pb_F8] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 mova m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 paddsb m7, [pb_3] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 paddsb m6, [pb_4] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 pand m7, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 pand m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 pcmpgtb m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 psubb m0, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 psrlq m7, 3 ; +f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 psrlq m0, 3 ; -f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 pand m0, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 pandn m1, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 psubusb m3, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 paddusb m3, m1 ; p0+f2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 pcmpgtb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 psrlq m6, 3 ; +f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 psrlq m1, 3 ; -f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 pand m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 pandn m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 psubusb m4, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 paddusb m4, m1 ; q0-f1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2595 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2596 mova m7, [pb_1] |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2597 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 mova m7, [pw_63] |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2599 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 mova m6, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 pcmpgtb m0, m1 ; which are negative |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2608 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2609 punpcklbw m6, m7 ; interleave with "1" for rounding |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2610 punpckhbw m1, m7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2611 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 punpcklbw m6, m0 ; signed byte->word |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 punpckhbw m1, m0 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2614 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 mova lim_sign, m0 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2616 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2617 mova m7, [pb_27_63] |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2618 %ifndef m8 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2619 mova lim_res, m1 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2620 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2621 %ifdef m10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2622 SWAP 0, 10 ; don't lose lim_sign copy |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2623 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2624 mova m0, m7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2625 pmaddubsw m7, m6 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2626 SWAP 6, 7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2627 pmaddubsw m0, m1 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2628 SWAP 1, 0 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2629 %ifdef m10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2630 SWAP 0, 10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2631 %else |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2632 mova m0, lim_sign |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2633 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2634 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 mova mask_res, m6 ; backup for later in filter |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 mova lim_res, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 pmullw m6, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 pmullw m1, [pw_27] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2639 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2640 paddw m1, m7 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2641 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 packsswb m6, m1 ; a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 pand m1, m0 ; -a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 pandn m0, m6 ; +a0 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2649 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2650 mova m6, [pb_18_63] ; pipelining |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2651 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 psubusb m3, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 paddusb m4, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 paddusb m3, m0 ; p0+a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 psubusb m4, m0 ; q0-a0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2657 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2658 SWAP 6, 7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2659 %ifdef m10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2660 SWAP 1, 10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2661 %else |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2662 mova m1, lim_res |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2663 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2664 mova m0, m7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2665 pmaddubsw m7, m6 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2666 SWAP 6, 7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2667 pmaddubsw m0, m1 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2668 SWAP 1, 0 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2669 %ifdef m10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2670 SWAP 0, 10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2671 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2672 mova m0, lim_sign |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2673 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2674 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 pmullw m6, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2677 pmullw m1, [pw_18] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2678 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2679 paddw m1, m7 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2680 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2681 mova m0, lim_sign |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2682 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2683 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2684 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2685 pxor m1, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2686 psubb m1, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2687 pand m1, m0 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2688 pandn m0, m6 ; +a1 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2689 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2690 mova m6, [pb_9_63] |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2691 %endif |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2692 psubusb m2, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2693 paddusb m5, m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2694 paddusb m2, m0 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2695 psubusb m5, m0 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2696 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2697 %if ssse3_or_higher |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2698 SWAP 6, 7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2699 %ifdef m10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2700 SWAP 1, 10 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2701 %else |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2702 mova m1, lim_res |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2703 %endif |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2704 mova m0, m7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2705 pmaddubsw m7, m6 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2706 SWAP 6, 7 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2707 pmaddubsw m0, m1 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2708 SWAP 1, 0 |
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2709 %else |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2710 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2711 SWAP 6, 12 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2712 SWAP 1, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2713 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2714 mova m6, mask_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2715 mova m1, lim_res |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2716 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2717 pmullw m6, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2718 pmullw m1, [pw_9] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2719 paddw m6, m7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2720 paddw m1, m7 |
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2721 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2722 %ifdef m9 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2723 SWAP 7, 9 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2724 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2725 mova m7, lim_sign |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2726 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2727 psraw m6, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2728 psraw m1, 7 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2729 packsswb m6, m1 ; a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2730 pxor m0, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2731 psubb m0, m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2732 pand m0, m7 ; -a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2733 pandn m7, m6 ; +a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2734 %ifdef m8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2735 SWAP 1, 13 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2736 SWAP 6, 14 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2737 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2738 mova m1, p2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2739 mova m6, q2backup |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2740 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2741 psubusb m1, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2742 paddusb m6, m0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2743 paddusb m1, m7 ; p1+a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2744 psubusb m6, m7 ; q1-a1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2745 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2746 ; store |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2747 %ifidn %2, v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2748 movrow [dst2_reg+mstride_reg*4], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2749 movrow [dst_reg +mstride_reg*2], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2750 movrow [dst_reg +mstride_reg ], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2751 movrow [dst_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2752 movrow [dst2_reg], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2753 movrow [dst2_reg+ stride_reg ], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2754 %if mmsize == 16 && %4 == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2755 add dst8_reg, mstride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2756 movhps [dst8_reg+mstride_reg*2], m1 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2757 movhps [dst8_reg+mstride_reg ], m2 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2758 movhps [dst8_reg], m3 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2759 add dst8_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2760 movhps [dst8_reg], m4 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2761 movhps [dst8_reg+ stride_reg ], m5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2762 movhps [dst8_reg+ stride_reg*2], m6 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2763 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2764 %else ; h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2765 inc dst_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2766 inc dst2_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2767 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2768 ; 4x8/16 transpose |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2769 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2770 SBUTTERFLY bw, 5, 6, 0 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2771 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2772 %if mmsize == 8 ; mmx/mmxext (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2773 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2774 add dst_reg, 4 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2775 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2776 %else ; sse2 (h) |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2777 lea dst8_reg, [dst8_reg+mstride_reg+1] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2778 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2779 lea dst_reg, [dst2_reg+mstride_reg+4] |
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2780 lea dst8_reg, [dst8_reg+mstride_reg+4] |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2781 %ifidn %1, sse4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2782 add dst2_reg, 4 |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2783 %endif |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2784 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg |
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2785 %ifidn %1, sse4 |
12268
259988e7ad0f
Fix obvious bug in assignment. Somehow, the test vectors don't test this...
rbultje
parents:
12266
diff
changeset
|
2786 lea dst2_reg, [dst8_reg+ stride_reg] |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2787 %endif |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2788 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2789 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2790 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2791 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2792 %if mmsize == 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2793 %if %4 == 8 ; chroma |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2794 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2795 sub dst_reg, 5 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2796 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2797 cmp dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2798 mov dst_reg, dst8_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2799 jnz .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2800 %else |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2801 %ifidn %2, h |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2802 lea dst_reg, [dst_reg + stride_reg*8-5] |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2803 %else ; v |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2804 add dst_reg, 8 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2805 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2806 dec cnt_reg |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2807 jg .next8px |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2808 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2809 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2810 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2811 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2812 mov rsp, stack_reg ; restore stack pointer |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2813 %endif |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2814 RET |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2815 %endmacro |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2816 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Expand MBEDGE_LOOPFILTER for plain MMX, with SPLATB_REG bound to its MMX variant.
; Arguments as used by the macro body: %1 = instruction-set tag (compared with
; %ifidn), %2 = edge direction (v or h), %4 = 16 or 8 (the body labels the
; "%4 == 8" case as chroma).
; NOTE(review): %3 (6 here) and %5 (0 here) are consumed in the part of the macro
; that precedes this excerpt -- confirm their meaning there before changing them.
2817 INIT_MMX |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2818 %define SPLATB_REG SPLATB_REG_MMX |
12210 | 2819 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
2820 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2821 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2822 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2823 |
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; Expand MBEDGE_LOOPFILTER for MMXEXT: rebind SPLATB_REG to its MMXEXT variant,
; then emit the v/h variants for %4 = 16 and %4 = 8 (8 is the case the macro
; body labels as chroma). No INIT_* appears in this group, so the register
; setup in effect before it is reused.
2824 %define SPLATB_REG SPLATB_REG_MMXEXT |
12210 | 2825 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
2826 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
2827 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
2828 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2829 |
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
; Expand MBEDGE_LOOPFILTER for SSE2. INIT_XMM comes from x86inc.asm
; (presumably switching the mN aliases to XMM registers -- confirm there).
; SPLATB_REG and WRITE_8W are bound to their SSE2 variants; WRITE_8W is invoked
; by the macro's horizontal (h) store path.
; The 16-wide h variant passes 5 as %3 when m8 is defined and 6 otherwise; the
; macro body annotates the "m8 not defined" case as "sse2 on x86-32 or mmx/mmxext".
; NOTE(review): %3 and %5 (15 here) are consumed outside this excerpt -- confirm
; their meaning in the macro header before changing them.
2830 INIT_XMM |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2831 %define SPLATB_REG SPLATB_REG_SSE2 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2832 %define WRITE_8W WRITE_8W_SSE2 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2833 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2834 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2835 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2836 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2837 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 |
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2838 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2839 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2840 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 |
12210 | 2841 |
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; Expand MBEDGE_LOOPFILTER for SSSE3: only SPLATB_REG is rebound in this group,
; so WRITE_8W keeps whatever binding was in effect before it.
; The 16-wide h variant passes 5 as %3 when m8 is defined and 6 otherwise (the
; macro body annotates "m8 not defined" as "sse2 on x86-32 or mmx/mmxext").
; NOTE(review): %3 and %5 are consumed outside this excerpt -- confirm their
; meaning in the macro header before changing them.
2842 %define SPLATB_REG SPLATB_REG_SSSE3 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2843 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 |
12210 | 2844 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2845 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 |
12210 | 2846 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2847 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 |
12210 | 2848 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2851 |
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
; Expand MBEDGE_LOOPFILTER for SSE4 with WRITE_8W rebound to WRITE_8W_SSE4.
; Only horizontal (h) variants are expanded in the lines visible here; the
; macro's "%ifidn %1, sse4" branches sit in its h store path, where they adjust
; dst2_reg around the WRITE_8W calls.
; The 16-wide variant passes 5 as %3 when m8 is defined and 6 otherwise.
; NOTE(review): %3 and %5 are consumed outside this excerpt -- confirm their
; meaning in the macro header before changing them.
2852 %define WRITE_8W WRITE_8W_SSE4 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2853 %ifdef m8 |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2855 %else |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2857 %endif |
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 |