Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12083:dd5efc28bca9 libavcodec
Give context to av_log
author | bcoudurier |
---|---|
date | Sat, 03 Jul 2010 04:40:12 +0000 |
parents | 8527154f6e81 |
children | d780ae746855 |
rev | line source |
---|---|
11975 | 1 ;****************************************************************************** |
2 ;* VP8 MMXEXT optimizations | |
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
5 ;* | |
6 ;* This file is part of FFmpeg. | |
7 ;* | |
8 ;* FFmpeg is free software; you can redistribute it and/or | |
9 ;* modify it under the terms of the GNU Lesser General Public | |
10 ;* License as published by the Free Software Foundation; either | |
11 ;* version 2.1 of the License, or (at your option) any later version. | |
12 ;* | |
13 ;* FFmpeg is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 ;* Lesser General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU Lesser General Public | |
19 ;* License along with FFmpeg; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 ;****************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
11975 | 25 |
26 SECTION_RODATA | |
27 | |
; 4-tap filter coefficients stored as 16-bit words, laid out for pmaddwd.
; Each row repeats one coefficient pair four times (16 bytes). Two consecutive
; rows make up one filter: the (F0, F1) pair followed by the (F2, F3) pair.
; The word-based h4 functions in this file (put_vp8_epel4_h4_mmxext,
; put_vp8_epel8_h4_sse2) read the two rows at [table + (r5 << 4) - 16] and
; [table + (r5 << 4)]. Filters are 32 bytes apart, so that index only lands
; on the start of a filter for odd r5 (1, 3, 5, 7).
; NOTE(review): presumably callers only pass odd mx here - confirm against
; the C-side dispatch.
28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
29 times 4 dw 12, -1 | |
30 times 4 dw -9, 93 | |
31 times 4 dw 50, -6 | |
32 times 4 dw -6, 50 | |
33 times 4 dw 93, -9 | |
34 times 4 dw -1, 12 | |
35 times 4 dw 123, -6 | |
36 | |
; 6-tap filter coefficients stored as 16-bit words, laid out for pmaddwd.
; Each row repeats one coefficient pair four times (16 bytes). Three
; consecutive rows make up one filter: (F0, F1), (F2, F3), (F4, F5).
; put_vp8_epel4_h6_mmxext computes r5 * 3 and then reads the rows at
; [table + r5*8 - 48], [... - 32] and [... - 16], i.e. byte offset
; 24 * mx - 48. Filters are 48 bytes apart, so the index lands on the start
; of a filter for r5 = 2, 4, 6.
; NOTE(review): presumably callers only pass even mx here - confirm against
; the C-side dispatch.
37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
38 times 4 dw 108, 36 | |
39 times 4 dw -8, 1 | |
40 times 4 dw 3, -16 | |
41 times 4 dw 77, 77 | |
42 times 4 dw -16, 3 | |
43 times 4 dw 1, -8 | |
44 times 4 dw 36, 108 | |
45 times 4 dw -11, 2 | |
46 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
11975 | 55 |
; 6-tap filter coefficients stored as bytes, laid out for pmaddubsw.
; Each row repeats one coefficient pair eight times (16 bytes); three rows
; per filter. The coefficient values are the same as in sixtap_filter_hw_m,
; but the pairing differs: the first row holds the two outermost taps
; (F0, F5), followed by (F1, F2) and (F3, F4). This matches the pixel
; pairing used by the SSSE3 code: filter_h6_shuf1 pairs source bytes i and
; i+5 for the horizontal filter, and the vertical filter interleaves the
; oldest row with the newest row before multiplying by the first table row.
56 sixtap_filter_hb_m: times 8 db 2, 1 | |
57 times 8 db -11, 108 | |
58 times 8 db 36, -8 | |
59 times 8 db 3, 3 | |
60 times 8 db -16, 77 | |
61 times 8 db 77, -16 | |
62 times 8 db 1, 2 | |
63 times 8 db -8, 36 | |
64 times 8 db 108, -11 | |
65 | |
; 4-tap filter coefficients with ONE coefficient per row, broadcast to eight
; 16-bit words (16 bytes per row). Four consecutive rows (F0, F1, F2, F3)
; make up one filter; the coefficient values are the same as in
; fourtap_filter_hw_m, just un-paired.
66 fourtap_filter_v_m: times 8 dw -6 | |
67 times 8 dw 123 | |
68 times 8 dw 12 | |
69 times 8 dw -1 | |
70 times 8 dw -9 | |
71 times 8 dw 93 | |
72 times 8 dw 50 | |
73 times 8 dw -6 | |
74 times 8 dw -6 | |
75 times 8 dw 50 | |
76 times 8 dw 93 | |
77 times 8 dw -9 | |
78 times 8 dw -1 | |
79 times 8 dw 12 | |
80 times 8 dw 123 | |
81 times 8 dw -6 | |
82 | |
; 6-tap filter coefficients with ONE coefficient per row, broadcast to eight
; 16-bit words (16 bytes per row). Six consecutive rows (F0 .. F5) make up
; one filter; the coefficient values are the same as in sixtap_filter_hw_m,
; just un-paired.
83 sixtap_filter_v_m: times 8 dw 2 | |
84 times 8 dw -11 | |
85 times 8 dw 108 | |
86 times 8 dw 36 | |
87 times 8 dw -8 | |
88 times 8 dw 1 | |
89 times 8 dw 3 | |
90 times 8 dw -16 | |
91 times 8 dw 77 | |
92 times 8 dw 77 | |
93 times 8 dw -16 | |
94 times 8 dw 3 | |
95 times 8 dw 1 | |
96 times 8 dw -8 | |
97 times 8 dw 36 | |
98 times 8 dw 108 | |
99 times 8 dw -11 | |
100 times 8 dw 2 | |
101 | |
; Bilinear weights 1..7 as 16-bit words: row k-1 holds the value k broadcast
; to eight words (16 bytes per row). There is no row for weight 0.
11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
103 times 8 dw 2 | |
104 times 8 dw 3 | |
105 times 8 dw 4 | |
106 times 8 dw 5 | |
107 times 8 dw 6 | |
108 times 8 dw 7 | |
109 | |
; Bilinear weights as byte pairs: row k-1 holds the pair (8 - k, k) repeated
; eight times (16 bytes per row), for k = 1..7. Each pair sums to 8.
110 bilinear_filter_vb_m: times 8 db 7, 1 | |
111 times 8 db 6, 2 | |
112 times 8 db 5, 3 | |
113 times 8 db 4, 4 | |
114 times 8 db 3, 5 | |
115 times 8 db 2, 6 | |
116 times 8 db 1, 7 | |
117 | |
; Table aliases used by the functions below.
; With PIC defined, every alias expands to r11: a function must first load
; the address of the table it wants (lea r11, [<table>_m]) and can then index
; it as [alias + offset]. Because all aliases map to the same register, only
; one table can be addressed this way at a time.
; Without PIC, each alias expands directly to the corresponding *_m label.
11975 | 118 %ifdef PIC |
11991 | 119 %define fourtap_filter_hw r11 |
120 %define sixtap_filter_hw r11 | |
121 %define fourtap_filter_hb r11 | |
122 %define sixtap_filter_hb r11 | |
123 %define fourtap_filter_v r11 | |
124 %define sixtap_filter_v r11 | |
125 %define bilinear_filter_vw r11 | |
126 %define bilinear_filter_vb r11 | |
11975 | 127 %else |
128 %define fourtap_filter_hw fourtap_filter_hw_m | |
129 %define sixtap_filter_hw sixtap_filter_hw_m | |
130 %define fourtap_filter_hb fourtap_filter_hb_m | |
131 %define sixtap_filter_hb sixtap_filter_hb_m | |
132 %define fourtap_filter_v fourtap_filter_v_m | |
133 %define sixtap_filter_v sixtap_filter_v_m | |
11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
135 %define bilinear_filter_vb bilinear_filter_vb_m | |
11975 | 136 %endif |
137 | |
11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
11975 | 140 |
; pshufb masks for the SSSE3 horizontal 6-tap filter. For output pixel i
; (source window starting at byte i):
;   shuf1 pairs bytes (i,   i+5) -> multiplied by the (F0, F5) table row
;   shuf2 pairs bytes (i+1, i+2) -> multiplied by the (F1, F2) table row
;   shuf3 pairs bytes (i+3, i+4) -> multiplied by the (F3, F4) table row
; In the width-4 instantiation the shuf1 pairing is produced with
; punpcklbw against [r2+3] instead of this mask.
11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
11975 | 144 |
12013 | 145 pw_20091: times 4 dw 20091 |
146 pw_17734: times 4 dw 17734 | |
147 | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
148 cextern pw_3 |
11975 | 149 cextern pw_4 |
150 cextern pw_64 | |
151 | |
152 SECTION .text | |
153 | |
154 ;----------------------------------------------------------------------------- | |
155 ; subpel MC functions: | |
156 ; | |
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
158 ; uint8_t *src, int srcstride, | |
159 ; int height, int mx, int my); | |
160 ;----------------------------------------------------------------------------- | |
161 | |
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
162 %macro FILTER_SSSE3 3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
163 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
164 lea r5d, [r5*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
165 mova m3, [filter_h6_shuf2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
166 mova m4, [filter_h6_shuf3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
167 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
168 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
169 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
170 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
171 mova m6, [sixtap_filter_hb+r5*8-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
172 mova m7, [sixtap_filter_hb+r5*8-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
173 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
174 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
175 movu m0, [r2-2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 mova m2, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 %ifidn %1, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 ; shuffle with a memory operand |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 punpcklbw m0, [r2+3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 %else |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 pshufb m0, [filter_h6_shuf1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 pshufb m1, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 pshufb m2, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 pmaddubsw m2, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 paddsw m0, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 shl r5d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 mova m2, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 mova m3, [filter_h2_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 mova m4, [filter_h4_shuf] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 mova m6, [fourtap_filter_hb+r5] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
214 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 movu m0, [r2-1] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 mova m1, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 pshufb m0, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 pshufb m1, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 pmaddubsw m0, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 pmaddubsw m1, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 paddsw m0, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 paddsw m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 psraw m0, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 packuswb m0, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 movh [r0], m0 ; store |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 shl r6d, 4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 lea r11, [fourtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 mova m5, [fourtap_filter_hb+r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 mova m6, [fourtap_filter_hb+r6] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 mova m7, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 ; read 3 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
245 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 movh m1, [r2+ r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 movh m2, [r2+2*r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 movh m3, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 mova m4, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 punpcklbw m4, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 punpcklbw m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 pmaddubsw m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 pmaddubsw m2, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 paddsw m4, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 paddsw m4, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 psraw m4, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 packuswb m4, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 movh [r0], m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 lea r6d, [r6*3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 %ifdef PIC |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 lea r11, [sixtap_filter_hb_m] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 %endif |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 lea r6, [sixtap_filter_hb+r6*8] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 ; read 5 lines |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 sub r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
284 movh m0, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 movh m1, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 movh m2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 lea r2, [r2+r3*2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 movh m3, [r2] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 movh m4, [r2+r3] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 movh m5, [r2+2*r3] ; read new row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 mova m6, m0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 punpcklbw m6, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 mova m0, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 punpcklbw m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 mova m7, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 punpcklbw m7, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 pmaddubsw m6, [r6-48] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 pmaddubsw m1, [r6-32] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 pmaddubsw m7, [r6-16] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 paddsw m6, m1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 paddsw m6, m7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 mova m1, m2 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 paddsw m6, [pw_64] |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 mova m2, m3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 psraw m6, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 mova m3, m4 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 packuswb m6, m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 mova m4, m5 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 movh [r0], m6 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 ; go to next line |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 add r0, r1 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 add r2, r3 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 dec r4 ; next row |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 jg .nextrow |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 REP_RET |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 %endmacro |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 INIT_MMX |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 FILTER_SSSE3 4, 0, 0 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 INIT_XMM |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 FILTER_SSSE3 8, 8, 7 |
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 |
11975 | 327 ; 4-pixel-wide block, H-only 4-tap filter (row count taken from r4) |
; Each loop iteration filters one row and stores 4 output bytes.
; Register roles, following the argument order in the "subpel MC functions"
; prototype comment: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = remaining rows, r5 = mx (turned into a table offset by the shl).
; Loop-invariant MMX registers:
;   mm4 = (F0, F1) coefficient pairs     mm5 = (F2, F3) coefficient pairs
;   mm6 = zero (byte->word unpacking and final pack)
;   mm7 = pw_64, added as the rounding term before the arithmetic >> 7
328 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
329 shl r5d, 4 | |
330 %ifdef PIC | |
331 lea r11, [fourtap_filter_hw_m] | |
332 %endif | |
333 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
334 movq mm5, [fourtap_filter_hw+r5] | |
335 movq mm7, [pw_64] | |
336 pxor mm6, mm6 | |
337 | |
338 .nextrow | |
339 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
340 | |
341 ; first set of 2 pixels | |
342 movq mm2, mm1 ; byte ABCD.. | |
343 punpcklbw mm1, mm6 ; byte->word ABCD | |
344 pshufw mm0, mm2, 9 ; byte CDEF.. | |
345 punpcklbw mm0, mm6 ; byte->word CDEF | |
346 pshufw mm3, mm1, 0x94 ; word ABBC | |
347 pshufw mm1, mm0, 0x94 ; word CDDE | |
348 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
349 movq mm0, mm1 ; backup for second set of pixels | |
350 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
351 paddd mm3, mm1 ; finish 1st 2px | |
352 | |
353 ; second set of 2 pixels, use backup of above | |
354 punpckhbw mm2, mm6 ; byte->word EFGH | |
355 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
356 pshufw mm1, mm2, 0x94 ; word EFFG | |
357 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
358 paddd mm0, mm1 ; finish 2nd 2px | |
359 | |
360 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
361 packssdw mm3, mm0 ; merge dword->word (4px) | |
362 paddsw mm3, mm7 ; rounding | |
363 psraw mm3, 7 | |
364 packuswb mm3, mm6 ; clip and word->bytes | |
365 movd [r0], mm3 ; store | |
366 | |
367 ; go to next line | |
368 add r0, r1 | |
369 add r2, r3 | |
370 dec r4 ; next row | |
371 jg .nextrow | |
372 REP_RET | |
373 | |
374 ; 4-pixel-wide block, H-only 6-tap filter (row count taken from r4) | |
; Each loop iteration filters one row and stores 4 output bytes.
; Register roles, following the argument order in the "subpel MC functions"
; prototype comment: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = remaining rows, r5 = mx (multiplied by 3, then scaled by 8 when
; indexing, i.e. 24 bytes per mx step).
; Loop-invariant MMX registers:
;   mm4 = (F0, F1) pairs   mm5 = (F2, F3) pairs   mm6 = (F4, F5) pairs
;   mm7 = pw_64, added as the rounding term before the arithmetic >> 7
; mm3 is NOT invariant: it is the zero register at the top of each row, is
; reused as scratch in the middle, and is re-zeroed by the pxor in the second
; half so that the final pack and the next iteration see zero again.
375 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
376 lea r5d, [r5*3] | |
377 %ifdef PIC | |
378 lea r11, [sixtap_filter_hw_m] | |
379 %endif | |
380 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
381 movq mm5, [sixtap_filter_hw+r5*8-32] | |
382 movq mm6, [sixtap_filter_hw+r5*8-16] | |
383 movq mm7, [pw_64] | |
384 pxor mm3, mm3 | |
385 | |
386 .nextrow | |
387 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
388 | |
389 ; first set of 2 pixels | |
390 movq mm2, mm1 ; byte ABCD.. | |
391 punpcklbw mm1, mm3 ; byte->word ABCD | |
392 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
393 punpckhbw mm2, mm3 ; byte->word EFGH | |
394 punpcklbw mm0, mm3 ; byte->word CDEF | |
395 pshufw mm1, mm1, 0x94 ; word ABBC | |
396 pshufw mm2, mm2, 0x94 ; word EFFG | |
397 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
398 pshufw mm3, mm0, 0x94 ; word CDDE | |
399 movq mm0, mm3 ; backup for second set of pixels | |
400 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
401 paddd mm1, mm3 ; add to 1st 2px cache | |
402 movq mm3, mm2 ; backup for second set of pixels | |
403 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
404 paddd mm1, mm2 ; finish 1st 2px | |
405 | |
406 ; second set of 2 pixels, use backup of above | |
407 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
408 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
409 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
410 paddd mm0, mm3 ; add to 2nd 2px cache | |
411 pxor mm3, mm3 | |
412 punpcklbw mm2, mm3 ; byte->word FGHI | |
413 pshufw mm2, mm2, 0xE9 ; word GHHI | |
414 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
415 paddd mm0, mm2 ; finish 2nd 2px | |
416 | |
417 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
418 packssdw mm1, mm0 ; merge dword->word (4px) | |
419 paddsw mm1, mm7 ; rounding | |
420 psraw mm1, 7 | |
421 packuswb mm1, mm3 ; clip and word->bytes | |
422 movd [r0], mm1 ; store | |
423 | |
424 ; go to next line | |
425 add r0, r1 | |
426 add r2, r3 | |
427 dec r4 ; next row | |
428 jg .nextrow | |
429 REP_RET | |
430 | |
431 ; 8-pixel-wide block, H-only 4-tap filter (row count taken from r4) | |
; Each loop iteration filters one row in two halves of 4 output pixels and
; stores the 8 packed result bytes.
; Register roles, following the argument order in the "subpel MC functions"
; prototype comment: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
; r4 = remaining rows, r5 = mx (turned into a table offset by the shl).
; Loop-invariant XMM registers:
;   m5 = (F0, F1) coefficient pairs     m6 = (F2, F3) coefficient pairs
;   m7 = zero (byte->word unpacking and final pack)
; The rounding term pw_64 is added from memory before the arithmetic >> 7.
432 INIT_XMM | |
433 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8 | |
434 shl r5d, 4 | |
435 %ifdef PIC | |
436 lea r11, [fourtap_filter_hw_m] | |
437 %endif | |
438 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
439 mova m6, [fourtap_filter_hw+r5] | |
440 pxor m7, m7 | |
441 | |
442 .nextrow | |
; first 4 output pixels: source window starts at src-1
443 movh m0, [r2-1] | |
444 punpcklbw m0, m7 ; ABCDEFGH | |
445 mova m1, m0 | |
446 mova m2, m0 | |
447 mova m3, m0 | |
448 psrldq m1, 2 ; BCDEFGH | |
449 psrldq m2, 4 ; CDEFGH | |
450 psrldq m3, 6 ; DEFGH | |
451 punpcklwd m0, m1 ; ABBCCDDE | |
452 punpcklwd m2, m3 ; CDDEEFFG | |
453 pmaddwd m0, m5 | |
454 pmaddwd m2, m6 | |
455 paddd m0, m2 | |
456 | |
; second 4 output pixels: same computation on the window starting 4 bytes
; further right (src+3); the letters below are relative to this new load
457 movh m1, [r2+3] | |
458 punpcklbw m1, m7 ; ABCDEFGH | |
459 mova m2, m1 | |
460 mova m3, m1 | |
461 mova m4, m1 | |
462 psrldq m2, 2 ; BCDEFGH | |
463 psrldq m3, 4 ; CDEFGH | |
464 psrldq m4, 6 ; DEFGH | |
465 punpcklwd m1, m2 ; ABBCCDDE | |
466 punpcklwd m3, m4 ; CDDEEFFG | |
467 pmaddwd m1, m5 | |
468 pmaddwd m3, m6 | |
469 paddd m1, m3 | |
470 | |
; merge both halves (dword->word), round, shift, clip to bytes
471 packssdw m0, m1 | |
472 paddsw m0, [pw_64] | |
473 psraw m0, 7 | |
474 packuswb m0, m7 | |
475 movh [r0], m0 ; store | |
476 | |
477 ; go to next line | |
478 add r0, r1 | |
479 add r2, r3 | |
480 dec r4 ; next row | |
481 jg .nextrow | |
482 REP_RET | |
483 | |
; 8-pixel-wide rows, H-only 6-tap filter (SSE2)
;
; Register use (x86inc numbering):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = number of rows, r5 = horizontal filter index (mx)
; One unaligned 16-byte load at [r2-2] supplies all source bytes of a row.
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
    lea      r5d, [r5*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hw_m]
%endif
    lea       r5, [sixtap_filter_hw+r5*8]    ; r5 = table + mx*24; taps are read at r5-48/-32/-16
    pxor      m7, m7                         ; zero, for byte->word unpack and final pack

.nextrow:
    ; output pixels 0-3; bytes are named A..P starting at [r2-2]
    movu      m0, [r2-2]
    mova      m6, m0                         ; keep the raw bytes for pixels 4-7
    mova      m4, m0
    punpcklbw m0, m7                         ; ABCDEFGH
    mova      m1, m0
    mova      m2, m0
    mova      m3, m0
    psrldq    m1, 2                          ; BCDEFGH
    psrldq    m2, 4                          ; CDEFGH
    psrldq    m3, 6                          ; DEFGH
    psrldq    m4, 4                          ; drop 4 bytes: raw bytes now start at E
    punpcklbw m4, m7                         ; EFGHIJKL
    mova      m5, m4
    psrldq    m5, 2                          ; FGHIJKL
    punpcklwd m0, m1                         ; ABBCCDDE
    punpcklwd m2, m3                         ; CDDEEFFG
    punpcklwd m4, m5                         ; EFFGGHHI
    pmaddwd   m0, [r5-48]                    ; first tap pair
    pmaddwd   m2, [r5-32]                    ; second tap pair
    pmaddwd   m4, [r5-16]                    ; third tap pair
    paddd     m0, m2
    paddd     m0, m4                         ; 4 dword sums for pixels 0-3

    ; output pixels 4-7: same pattern, shifted 4 source bytes to the right
    psrldq    m6, 4                          ; raw bytes now start at E
    mova      m4, m6
    punpcklbw m6, m7                         ; EFGHIJKL
    mova      m1, m6
    mova      m2, m6
    mova      m3, m6
    psrldq    m1, 2                          ; FGHIJKL
    psrldq    m2, 4                          ; GHIJKL
    psrldq    m3, 6                          ; HIJKL
    psrldq    m4, 4                          ; raw bytes now start at I
    punpcklbw m4, m7                         ; IJKLMNOP
    mova      m5, m4
    psrldq    m5, 2                          ; JKLMNOP
    punpcklwd m6, m1                         ; EFFGGHHI
    punpcklwd m2, m3                         ; GHHIIJJK
    punpcklwd m4, m5                         ; IJJKKLLM
    pmaddwd   m6, [r5-48]
    pmaddwd   m2, [r5-32]
    pmaddwd   m4, [r5-16]
    paddd     m6, m2
    paddd     m6, m4                         ; 4 dword sums for pixels 4-7

    ; merge, round (+64, >>7), clip to bytes and store 8 pixels
    packssdw  m0, m6
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0                         ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; The row counter is handled as a 32-bit value, matching the
    ; put_vp8_pixels* loops in this file, so the upper half of the
    ; 64-bit register is never interpreted.
    dec      r4d                             ; next row
    jg .nextrow
    REP_RET
550 | |
; FILTER_V: emits the vertical-only 4-tap and 6-tap subpel filters.
;   %1 = instruction-set suffix used in the function name (mmxext, sse2)
;   %2 = block width used in the function name (4 or 8)
;   %3 = number of XMM registers declared to cglobal
; Register use in both functions (x86inc numbering):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = number of rows, r6 = vertical filter index (my)
; movh moves half a vector register, so one call of movh handles one row of
; %2 pixels when the macro is instantiated as below.
%macro FILTER_V 3
; %2-pixel-wide rows, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                          ; r6 = my*32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]   ; r6 -> four 16-byte tap vectors for this my
    mova      m6, [pw_64]                    ; rounding constant
    pxor      m7, m7                         ; zero, for unpack and pack
    mova      m5, [r6+48]                    ; tap applied to the newest row, kept in a register

    ; read 3 lines: rows -1, 0 and +1 relative to the first output row
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3                         ; r2 is back at row 0
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                  ; read new row
    punpcklbw m4, m7
    mova      m3, m4                         ; unscaled copy, becomes m2 for the next row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps, rotating the row window m0<-m1<-m2<-m3
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; 32-bit row counter, consistent with the put_vp8_pixels* loops in this
    ; file; the upper half of the 64-bit register is never interpreted.
    dec      r4d                             ; next row
    jg .nextrow
    REP_RET


; %2-pixel-wide rows, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                     ; r6 = my*48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]    ; r6 -> six 16-byte tap vectors for this my
    pxor      m7, m7                         ; zero, for unpack and pack

    ; read 5 lines: rows -2 .. +2 relative to the first output row
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3                         ; r2 now points at row +1
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps, rotating the row window m0<-m1<-...<-m4
    movh      m5, [r2+2*r3]                  ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5                         ; newest row enters the window unscaled
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; 32-bit row counter, see the 4-tap function above in this macro
    dec      r4d                             ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
674 | |
; FILTER_BILINEAR: emits the vertical and horizontal bilinear filters that
; work on 16-bit intermediates (pmullw).
;   %1 = instruction-set suffix used in the function name (mmxext, sse2)
;   %2 = block width used in the function name (4 or 8)
;   %3 = number of XMM registers declared to cglobal
; Register use (x86inc numbering):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = number of rows, r5 = mx, r6 = my
; Both functions produce two output rows per loop iteration.
; Rounding: (sum >> 2) followed by pavgw against zero, i.e. ((sum >> 2) + 1) >> 1.
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                          ; r6 = my*16
    sub      r5d, r6d                        ; r5 = (8-my)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                         ; zero, for unpack and pavgw rounding
    mova      m4, [bilinear_filter_vw+r5-16] ; weight vector selected by 8-my (upper row)
    mova      m5, [bilinear_filter_vw+r6-16] ; weight vector selected by my (lower row)
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                         ; the middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                         ; output row 0
    paddsw    m2, m3                         ; output row 1
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m2
%else
    packuswb  m0, m2                         ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit row counter, consistent with the put_vp8_pixels* loops in this
    ; file; the upper half of the 64-bit register is never interpreted.
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                          ; r5 = mx*16
    sub      r6d, r5d                        ; r6 = (8-mx)*16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6                         ; zero, for unpack and pavgw rounding
    mova      m4, [bilinear_filter_vw+r6-16] ; weight vector selected by 8-mx (left pixel)
    mova      m5, [bilinear_filter_vw+r5-16] ; weight vector selected by mx (right pixel)
.nextrow:
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                         ; output row 0
    paddsw    m2, m3                         ; output row 1
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m2
%else
    packuswb  m0, m2                         ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit row counter, see the vertical function above in this macro
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
772 | |
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
; FILTER_BILINEAR_SSSE3: emits the vertical and horizontal bilinear filters
; built on pmaddubsw, which multiplies byte pairs and adds each pair.
;   %1 = block width used in the function name (4 with INIT_MMX, 8 with INIT_XMM)
; Register use (x86inc numbering):
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
;   r4 = number of rows, r5 = mx, r6 = my
; Both functions produce two output rows per loop iteration.
; Rounding: (sum >> 2) followed by pavgw against zero, i.e. ((sum >> 2) + 1) >> 1.
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                          ; r6 = my*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                         ; zero, for pavgw rounding
    mova      m3, [bilinear_filter_vb+r6-16] ; byte weight vector selected by my
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                         ; interleave rows 0 and 1 bytewise
    punpcklbw m1, m2                         ; interleave rows 1 and 2 bytewise
    pmaddubsw m0, m3                         ; output row 0
    pmaddubsw m1, m3                         ; output row 1
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m1
%else
    packuswb  m0, m1                         ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit row counter, consistent with the put_vp8_pixels* loops in this
    ; file; the upper half of the 64-bit register is never interpreted.
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                          ; r5 = mx*16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4                         ; zero, for pavgw rounding
    mova      m2, [filter_h2_shuf]           ; pshufb control applied to each source row
    mova      m3, [bilinear_filter_vb+r5-16] ; byte weight vector selected by mx
.nextrow:
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2                         ; arrange source bytes for pmaddubsw
    pshufb    m1, m2
    pmaddubsw m0, m3                         ; output row 0
    pmaddubsw m1, m3                         ; output row 1
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m1
%else
    packuswb  m0, m1                         ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit row counter, see the vertical function above in this macro
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
11991 | 851 |
; Copy a block of 8-byte-wide rows, two rows per iteration (MMX).
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = number of rows
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    ; fetch both source rows before writing anything
    movq    mm0, [r2+r3*0]
    movq    mm1, [r2+r3*1]
    movq    [r0+r1*0], mm0
    movq    [r0+r1*1], mm1
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea      r2, [r2+r3*2]
    lea      r0, [r0+r1*2]
    sub     r4d, 2                           ; two rows done
    jg .nextrow
    REP_RET
863 | |
; Copy a block of 16-byte-wide rows, two rows per iteration, using 8-byte
; MMX moves (two per row).
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = number of rows
cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    ; fetch both source rows (left and right halves) before writing anything
    movq    mm0, [r2+r3*0+0]
    movq    mm1, [r2+r3*0+8]
    movq    mm2, [r2+r3*1+0]
    movq    mm3, [r2+r3*1+8]
    movq    [r0+r1*0+0], mm0
    movq    [r0+r1*0+8], mm1
    movq    [r0+r1*1+0], mm2
    movq    [r0+r1*1+8], mm3
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea      r2, [r2+r3*2]
    lea      r0, [r0+r1*2]
    sub     r4d, 2                           ; two rows done
    jg .nextrow
    REP_RET
879 | |
; Copy a block of 16-byte-wide rows, two rows per iteration (SSE).
;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = number of rows
; Loads use movups (no alignment requirement); stores use movaps, which
; faults unless every destination row address is 16-byte aligned.
cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    ; fetch both source rows before writing anything
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    ; advance both pointers by two rows (lea leaves the flags untouched)
    lea      r2, [r2+r3*2]
    lea      r0, [r0+r1*2]
    sub     r4d, 2                           ; two rows done
    jg .nextrow
    REP_RET
891 | |
11975 | 892 ;----------------------------------------------------------------------------- |
893 ; IDCT functions: | |
894 ; | |
895 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
896 ;----------------------------------------------------------------------------- | |
897 | |
; DC-only inverse transform: add (block[0] + 4) >> 3 to a 4x4 area of dst,
; saturating to 0..255.
;   r0 = dst, r1 = block, r2 = stride
; Unsigned byte arithmetic cannot add a signed value directly, so the DC is
; split into a non-negative part (mm0) and a negated part (mm1); one of the
; two packs to zero, and dst gets a saturating add followed by a saturating
; subtract.
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data (only the low word is the DC; the rest is discarded below)
    movd      mm0, [r1]

    ; calculate DC
    paddw     mm0, [pw_4]
    pxor      mm1, mm1
    psraw     mm0, 3
    psubw     mm1, mm0                       ; mm1 = -dc
    packuswb  mm0, mm0                       ; clamp( dc) to 0..255
    packuswb  mm1, mm1                       ; clamp(-dc) to 0..255
    punpcklbw mm0, mm0                       ; replicate byte 0 ...
    punpcklbw mm1, mm1
    punpcklwd mm0, mm0                       ; ... across the low 4 bytes
    punpcklwd mm1, mm1

    ; add DC to the four dst rows
    lea        r1, [r0+r2*2]                 ; r1 is reused as the pointer to row 2
    movd      mm2, [r0]
    movd      mm3, [r0+r2]
    movd      mm4, [r1]
    movd      mm5, [r1+r2]
    paddusb   mm2, mm0
    paddusb   mm3, mm0
    paddusb   mm4, mm0
    paddusb   mm5, mm0
    psubusb   mm2, mm1
    psubusb   mm3, mm1
    psubusb   mm4, mm1
    psubusb   mm5, mm1
    movd      [r0], mm2
    movd   [r0+r2], mm3
    movd      [r1], mm4
    movd   [r1+r2], mm5
    RET
933 | |
; DC-only inverse transform: add (block[0] + 4) >> 3 to a 4x4 area of dst,
; saturating to 0..255. The four dst rows are processed as 16-bit words in
; two registers and written back with pextrd.
;   r0 = dst, r1 = block, r2 = stride
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       xmm0, [r1]
    lea          r1, [r0+r2*2]               ; r1 is reused as the pointer to row 2
    pxor       xmm1, xmm1                    ; zero, for byte->word unpack
    movq       xmm2, [pw_4]

    ; calculate DC (the dst row loads are interleaved with the arithmetic)
    paddw      xmm0, xmm2
    movd       xmm2, [r0]
    movd       xmm3, [r0+r2]
    movd       xmm4, [r1]
    movd       xmm5, [r1+r2]
    psraw      xmm0, 3
    pshuflw    xmm0, xmm0, 0                 ; broadcast word 0 over the low 4 words
    punpcklqdq xmm0, xmm0                    ; ... and over all 8 words
    punpckldq  xmm2, xmm3                    ; rows 0 and 1 side by side
    punpckldq  xmm4, xmm5                    ; rows 2 and 3 side by side
    punpcklbw  xmm2, xmm1
    punpcklbw  xmm4, xmm1
    paddw      xmm2, xmm0
    paddw      xmm4, xmm0
    packuswb   xmm2, xmm4                    ; clamp to bytes; one row per dword
    movd       [r0], xmm2
    pextrd  [r0+r2], xmm2, 1
    pextrd     [r1], xmm2, 2
    pextrd  [r1+r2], xmm2, 3
    RET
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
962 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
963 ;----------------------------------------------------------------------------- |
12013 | 964 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
965 ;----------------------------------------------------------------------------- | |
966 | |
; VP8_MULTIPLY_SUMSUB in1, in2, tmp1, tmp2
; On exit:
;   %1 = mul_35468(%1) - mul_20091(%2)
;   %2 = mul_20091(%1) + mul_35468(%2)     (using the values of %1/%2 on entry)
; %3 and %4 are scratch registers and are clobbered.
; Requires m6 = words of 20091 and m7 = words of 17734:
;   mul_20091(x) is formed as pmulhw(x, 20091) + x
;   mul_35468(x) is formed as pmulhw(2*x, 17734)
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6                         ; high word of 20091 * in1
    pmulhw    %4, m6                         ; high word of 20091 * in2
    paddw     %3, %1                         ; %3 = mul_20091(in1)
    paddw     %4, %2                         ; %4 = mul_20091(in2)
    paddw     %1, %1                         ; double the inputs so that 17734 acts as 35468
    paddw     %2, %2
    pmulhw    %1, m7                         ; %1 = mul_35468(in1)
    pmulhw    %2, m7                         ; %2 = mul_35468(in2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
983 | |
; VP8_IDCT_TRANSFORM4x4_1D a, b, c, d, tmp1, tmp2
; One 1-D pass of the 4x4 inverse transform; all arguments are register
; numbers (the macro prepends "m").
;   x0 = %1 + %3                              x1 = %1 - %3
;   x2 = mul_35468(%2) - mul_20091(%4)        x3 = mul_20091(%2) + mul_35468(%4)
; Results after the final register renaming:
;   %1 = x0 + x3 (tmp0)   %2 = x1 + x2 (tmp1)
;   %3 = x1 - x2 (tmp2)   %4 = x0 - x3 (tmp3)
; %5 and %6 are temporaries.
; m6/m7 must already hold the constant words 20091/17734.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5        ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6   ; t2, t3
    SUMSUB_BA           m%4, m%3, m%5        ; tmp0, tmp3
    SUMSUB_BA           m%2, m%1, m%5        ; tmp1, tmp2
    ; rename registers so the outputs land in %1..%4 in order
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
997 | |
; Full 4x4 inverse transform of block[], with the result added to dst.
;   r0 = dst, r1 = block, r2 = stride
; Two 1-D passes with a transpose after each; the rounding constant 4 is
; added to m0 between the passes.
INIT_MMX
cglobal vp8_idct_add_mmx, 3, 3
    ; load block data: four 8-byte groups of four 16-bit coefficients
    movq         m0, [r1]
    movq         m1, [r1+8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    ; multiplier constants expected by VP8_IDCT_TRANSFORM4x4_1D
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]                  ; rounding term for the final shift
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    ; NOTE(review): STORE_DIFFx2 is defined outside this file; the argument 3
    ; is presumably the shift applied before adding to dst - confirm in
    ; x86util.asm.
    pxor         m4, m4
    lea          r1, [r0+2*r2]               ; r1 is reused as the pointer to row 2
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
1022 | |
1023 ;----------------------------------------------------------------------------- | |
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1024 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1025 ;----------------------------------------------------------------------------- |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1026 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; SCATTER_WHT lane
; Extract word number %1 from each of m0..m3 and store the four values to
; r0 + 2*16*k for k = 0..3, i.e. to the first 16-bit coefficient of four
; consecutive 16-coefficient blocks.
; Clobbers r1 and r2.
%macro SCATTER_WHT 1
    pextrw r1d, m0, %1
    pextrw r2d, m1, %1
    mov [r0+2*16*0], r1w                     ; block 0, coefficient 0
    mov [r0+2*16*1], r2w                     ; block 1, coefficient 0
    pextrw r1d, m2, %1
    pextrw r2d, m3, %1
    mov [r0+2*16*2], r1w                     ; block 2, coefficient 0
    mov [r0+2*16*3], r2w                     ; block 3, coefficient 0
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1037 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; HADAMARD4_1D a, b, c, d
; One 1-D pass of a 4-point Hadamard transform over registers m%1..m%4,
; built from two SUMSUB_BADC butterfly stages (macro from x86util.asm).
; The closing SWAP renames registers so the outputs end up in %1..%4.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3           ; first butterfly stage
    SUMSUB_BADC m%4, m%2, m%3, m%1           ; second butterfly stage
    SWAP %1, %4, %3
%endmacro
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1043 |
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
; Inverse Walsh-Hadamard transform of the 16 luma DC values; each result is
; written to the first coefficient of one of the 16 luma blocks.
;   r0 = block (output), r1 = dc (input)
; r1 and r2 are reused as scratch by SCATTER_WHT once the input is loaded.
INIT_MMX
cglobal vp8_luma_dc_wht_mmxext, 2,3
    ; load the 4x4 matrix of 16-bit DC values, four per register
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]

    ; first pass, transpose, rounding term, second pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]                 ; rounding term for the >>3 below
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3

    ; scatter: each SCATTER_WHT writes one word lane of m0..m3 to four
    ; consecutive blocks, then r0 advances by four blocks (2*16*4 bytes)
    SCATTER_WHT   0
    add           r0, 2*16*4
    SCATTER_WHT   1
    add           r0, 2*16*4
    SCATTER_WHT   2
    add           r0, 2*16*4
    SCATTER_WHT   3
    RET