annotate x86/vp8dsp.asm @ 12266:48d6738904a9 libavcodec

Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this splits it into small optimization-specific macros which are selected for each DSP function. The advantage of this approach is that the sse4 functions now use the ssse3 codepath also without needing an explicit sse4 codepath.
author rbultje
date Sat, 24 Jul 2010 19:33:05 +0000
parents c7f6ddcc5c01
children 259988e7ad0f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
24 %include "x86util.asm"
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 fourtap_filter_hw_m: times 4 dw -6, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 sixtap_filter_hw_m: times 4 dw 2, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
47 fourtap_filter_hb_m: times 8 db -6, 123
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
48 times 8 db 12, -1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
49 times 8 db -9, 93
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
50 times 8 db 50, -6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
51 times 8 db -6, 50
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
52 times 8 db 93, -9
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
53 times 8 db -1, 12
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
54 times 8 db 123, -6
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 sixtap_filter_hb_m: times 8 db 2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 fourtap_filter_v_m: times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 sixtap_filter_v_m: times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
101
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
102 bilinear_filter_vw_m: times 8 dw 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
103 times 8 dw 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
104 times 8 dw 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
105 times 8 dw 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
106 times 8 dw 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
107 times 8 dw 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
108 times 8 dw 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
109
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
110 bilinear_filter_vb_m: times 8 db 7, 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
111 times 8 db 6, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
112 times 8 db 5, 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
113 times 8 db 4, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
114 times 8 db 3, 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
115 times 8 db 2, 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
116 times 8 db 1, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
117
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
118 %ifdef PIC
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
119 %define fourtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
120 %define sixtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
121 %define fourtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
122 %define sixtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
123 %define fourtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
124 %define sixtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
125 %define bilinear_filter_vw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
126 %define bilinear_filter_vb r11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
133 %define sixtap_filter_v sixtap_filter_v_m
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
134 %define bilinear_filter_vw bilinear_filter_vw_m
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
135 %define bilinear_filter_vb bilinear_filter_vb_m
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
137
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
140
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
145 pw_20091: times 4 dw 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
146 pw_17734: times 4 dw 17734
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
147
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
148 cextern pb_1
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
149 cextern pw_3
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
150 cextern pb_3
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
151 cextern pw_4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
152 cextern pb_4
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
153 cextern pw_9
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
154 cextern pw_18
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
155 cextern pw_27
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
156 cextern pw_63
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
157 cextern pw_64
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
158 cextern pb_80
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
159 cextern pb_F8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
160 cextern pb_FE
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
162 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
163
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
164 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
167 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
168 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
169 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
170 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
171
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
172 %macro FILTER_SSSE3 3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
173 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
174 lea r5d, [r5*3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
175 mova m3, [filter_h6_shuf2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
176 mova m4, [filter_h6_shuf3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
177 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
178 lea r11, [sixtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
179 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
180 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
181 mova m6, [sixtap_filter_hb+r5*8-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
182 mova m7, [sixtap_filter_hb+r5*8-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
183
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
184 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
185 movu m0, [r2-2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
186 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
187 mova m2, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
188 %ifidn %1, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
189 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
190 ; shuffle with a memory operand
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
191 punpcklbw m0, [r2+3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
192 %else
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
193 pshufb m0, [filter_h6_shuf1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
194 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
195 pshufb m1, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
196 pshufb m2, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
197 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
198 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
199 pmaddubsw m2, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
200 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
201 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
202 paddsw m0, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
203 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
204 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
205 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
206
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
207 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
208 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
209 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
210 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
211 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
212 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
213
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
214 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
215 shl r5d, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
216 mova m2, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
217 mova m3, [filter_h2_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
218 mova m4, [filter_h4_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
219 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
220 lea r11, [fourtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
221 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
222 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
223 mova m6, [fourtap_filter_hb+r5]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
224
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
225 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
226 movu m0, [r2-1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
227 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
228 pshufb m0, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
229 pshufb m1, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
230 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
231 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
232 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
233 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
234 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
235 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
236 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
237
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
238 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
239 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
240 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
241 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
242 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
243 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
244
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
245 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
246 shl r6d, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
247 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
248 lea r11, [fourtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
249 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
250 mova m5, [fourtap_filter_hb+r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
251 mova m6, [fourtap_filter_hb+r6]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
252 mova m7, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
253
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
254 ; read 3 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
255 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
256 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
257 movh m1, [r2+ r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
258 movh m2, [r2+2*r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
259 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
260
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
261 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
262 movh m3, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
263 mova m4, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
264 mova m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
265 punpcklbw m4, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
266 mova m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
267 punpcklbw m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
268 pmaddubsw m4, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
269 pmaddubsw m2, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
270 paddsw m4, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
271 mova m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
272 paddsw m4, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
273 psraw m4, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
274 packuswb m4, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
275 movh [r0], m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
276
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
277 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
278 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
279 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
280 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
281 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
282 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
283
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
284 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
285 lea r6d, [r6*3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
286 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
287 lea r11, [sixtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
288 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
289 lea r6, [sixtap_filter_hb+r6*8]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
290
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
291 ; read 5 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
292 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
293 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
294 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
295 movh m1, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
296 movh m2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
297 lea r2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
298 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
299 movh m3, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
300 movh m4, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
301
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
302 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
303 movh m5, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
304 mova m6, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
305 punpcklbw m6, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
306 mova m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
307 punpcklbw m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
308 mova m7, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
309 punpcklbw m7, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
310 pmaddubsw m6, [r6-48]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
311 pmaddubsw m1, [r6-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
312 pmaddubsw m7, [r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
313 paddsw m6, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
314 paddsw m6, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
315 mova m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
316 paddsw m6, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
317 mova m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
318 psraw m6, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
319 mova m3, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
320 packuswb m6, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
321 mova m4, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
322 movh [r0], m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
323
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
324 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
325 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
326 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
327 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
328 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
329 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
330 %endmacro
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
331
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
332 INIT_MMX
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
333 FILTER_SSSE3 4, 0, 0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
334 INIT_XMM
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
335 FILTER_SSSE3 8, 8, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
336
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
337 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
338 cglobal put_vp8_epel4_h4_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
339 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
340 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 movq mm5, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 pxor mm6, mm6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373 psraw mm3, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
384 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
385 cglobal put_vp8_epel4_h6_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
389 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
390 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391 movq mm5, [sixtap_filter_hw+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 movq mm6, [sixtap_filter_hw+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 pshufw mm3, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
414 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
415
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
419 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
420 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
429 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430 psraw mm1, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
434 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
441 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
446 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
447 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
449 mova m6, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
451
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
453 movh m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
454 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
455 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
456 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
457 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
458 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
459 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
461 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
462 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
463 pmaddwd m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
464 pmaddwd m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
465 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
466
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
467 movh m1, [r2+3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
468 punpcklbw m1, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
469 mova m2, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
470 mova m3, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
471 mova m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
472 psrldq m2, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
473 psrldq m3, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
474 psrldq m4, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
475 punpcklwd m1, m2 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
476 punpcklwd m3, m4 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
477 pmaddwd m1, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
478 pmaddwd m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
479 paddd m1, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
480
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
481 packssdw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
489 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
493
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
495 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
497 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
499 lea r5, [sixtap_filter_hw+r5*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
501
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
502 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
503 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
504 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
505 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
506 punpcklbw m0, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
507 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
508 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
509 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
511 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
512 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
513 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
514 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
515 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
516 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
517 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
518 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
519 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
520 pmaddwd m0, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
521 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
522 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
523 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
524 paddd m0, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
525
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
526 psrldq m6, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
527 mova m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
528 punpcklbw m6, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
529 mova m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
530 mova m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
531 mova m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
532 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
533 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
534 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
535 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
536 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
537 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
538 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
539 punpcklwd m6, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
540 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
541 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
542 pmaddwd m6, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
543 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545 paddd m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 paddd m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548 packssdw m0, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
551 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
555 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
556 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
557 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
558 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
559 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
560
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
561 %macro FILTER_V 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
562 ; 4x4 block, V-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
563 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
564 shl r6d, 5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
565 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
566 lea r11, [fourtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
567 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
568 lea r6, [fourtap_filter_v+r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
569 mova m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
570 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
571 mova m5, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
572
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
573 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
574 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
575 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
576 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
577 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
578 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
579 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
580 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
581 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
582
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
583 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
584 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
585 movh m4, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
586 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
587 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
588 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
589 pmullw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
590 paddsw m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
591
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
592 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
593 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
594 pmullw m1, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
595 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
596 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
597 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
598 paddsw m4, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
599 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
600
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
601 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
602 paddsw m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
603 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
604 packuswb m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
605 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
606
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
607 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
608 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
609 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
610 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
611 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
612 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
613
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
614
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
615 ; 4x4 block, V-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
616 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
617 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
618 lea r6, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
619 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
620 lea r11, [sixtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
621 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
622 lea r6, [sixtap_filter_v+r6-96]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
623 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
624
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
625 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
626 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
627 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
628 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
629 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
630 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
631 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
632 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
633 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
634 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
635 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
636 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
637 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
638 punpcklbw m3, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
639 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
640
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
641 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
642 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
643 mova m5, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
644 pmullw m5, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
645 mova m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
646 pmullw m6, [r6+64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
647 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
648
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
649 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
650 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
651 punpcklbw m5, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
652 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
653 paddsw m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
654 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
655 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
656 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
657 paddsw m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
658 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
659 pmullw m3, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
660 paddsw m6, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
661 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
662 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
663 pmullw m5, [r6+80]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
664 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
665
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
666 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
667 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
668 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
669 packuswb m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
670 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
671
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
672 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
673 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
674 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
675 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
676 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
677 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
678 %endmacro
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
679
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
680 INIT_MMX
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
681 FILTER_V mmxext, 4, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
682 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
683 FILTER_V sse2, 8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
684
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
685 %macro FILTER_BILINEAR 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
686 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
687 mov r5d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
688 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
689 sub r5d, r6d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
690 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
691 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
692 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
693 pxor m6, m6
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
694 mova m4, [bilinear_filter_vw+r5-16]
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
695 mova m5, [bilinear_filter_vw+r6-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
696 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
697 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
698 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
699 movh m3, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
700 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
701 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
702 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
703 mova m2, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
704 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
705 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
706 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
707 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
708 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
709 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
710 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
711 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
712 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
713 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
714 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
715 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
716 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
717 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
718 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
719 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
720 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
721 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
722 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
723 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
724
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
725 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
726 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
727 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
728 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
729 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
730
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
731 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
732 mov r6d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
733 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
734 sub r6d, r5d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
735 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
736 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
737 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
738 pxor m6, m6
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
739 mova m4, [bilinear_filter_vw+r6-16]
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
740 mova m5, [bilinear_filter_vw+r5-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
741 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
742 movh m0, [r2+r3*0+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
743 movh m1, [r2+r3*0+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
744 movh m2, [r2+r3*1+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
745 movh m3, [r2+r3*1+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
746 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
747 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
748 punpcklbw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
749 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
750 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
751 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
752 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
753 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
754 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
755 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
756 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
757 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
758 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
759 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
760 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
761 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
762 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
763 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
764 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
765 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
766 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
767 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
768 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
769 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
770
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
771 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
772 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
773 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
774 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
775 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
776 %endmacro
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
777
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
778 INIT_MMX
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
779 FILTER_BILINEAR mmxext, 4, 0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
780 INIT_XMM
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
781 FILTER_BILINEAR sse2, 8, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
782
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
783 %macro FILTER_BILINEAR_SSSE3 1
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
784 cglobal put_vp8_bilinear%1_v_ssse3, 7,7
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
785 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
786 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
787 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
788 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
789 pxor m4, m4
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
790 mova m3, [bilinear_filter_vb+r6-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
791 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
792 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
793 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
794 movh m2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
795 punpcklbw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
796 punpcklbw m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
797 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
798 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
799 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
800 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
801 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
802 pavgw m1, m4
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
803 %if mmsize==8
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
804 packuswb m0, m0
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
805 packuswb m1, m1
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
806 movh [r0+r1*0], m0
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
807 movh [r0+r1*1], m1
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
808 %else
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
809 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
810 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
811 movhps [r0+r1*1], m0
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
812 %endif
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
813
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
814 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
815 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
816 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
817 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
818 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
819
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
820 cglobal put_vp8_bilinear%1_h_ssse3, 7,7
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
821 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
822 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
823 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
824 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
825 pxor m4, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
826 mova m2, [filter_h2_shuf]
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
827 mova m3, [bilinear_filter_vb+r5-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
828 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
829 movu m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
830 movu m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
831 pshufb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
832 pshufb m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
833 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
834 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
835 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
836 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
837 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
838 pavgw m1, m4
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
839 %if mmsize==8
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
840 packuswb m0, m0
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
841 packuswb m1, m1
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
842 movh [r0+r1*0], m0
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
843 movh [r0+r1*1], m1
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
844 %else
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
845 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
846 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
847 movhps [r0+r1*1], m0
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
848 %endif
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
849
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
850 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
851 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
852 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
853 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
854 REP_RET
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
855 %endmacro
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
856
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
857 INIT_MMX
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
858 FILTER_BILINEAR_SSSE3 4
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
859 INIT_XMM
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
860 FILTER_BILINEAR_SSSE3 8
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
861
11992
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
862 cglobal put_vp8_pixels8_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
863 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
864 movq mm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
865 movq mm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
866 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
867 movq [r0+r1*0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
868 movq [r0+r1*1], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
869 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
870 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
871 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
872 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
873
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
874 cglobal put_vp8_pixels16_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
875 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
876 movq mm0, [r2+r3*0+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
877 movq mm1, [r2+r3*0+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
878 movq mm2, [r2+r3*1+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
879 movq mm3, [r2+r3*1+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
880 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
881 movq [r0+r1*0+0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
882 movq [r0+r1*0+8], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
883 movq [r0+r1*1+0], mm2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
884 movq [r0+r1*1+8], mm3
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
885 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
886 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
887 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
888 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
889
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
890 cglobal put_vp8_pixels16_sse, 5,5,2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
891 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
892 movups xmm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
893 movups xmm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
894 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
895 movaps [r0+r1*0], xmm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
896 movaps [r0+r1*1], xmm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
897 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
898 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
899 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
900 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
901
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
902 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
904 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
905
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
906 %macro ADD_DC 4
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
907 %4 m2, [r0+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
908 %4 m3, [r0+r2+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
909 %4 m4, [r1+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
910 %4 m5, [r1+r2+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
911 paddusb m2, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
912 paddusb m3, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
913 paddusb m4, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
914 paddusb m5, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
915 psubusb m2, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
916 psubusb m3, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
917 psubusb m4, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
918 psubusb m5, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
919 %4 [r0+%3], m2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
920 %4 [r0+r2+%3], m3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
921 %4 [r1+%3], m4
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
922 %4 [r1+r2+%3], m5
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
923 %endmacro
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
924
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
925 INIT_MMX
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
926 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
927 ; load data
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
928 movd m0, [r1]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
929
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
930 ; calculate DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
931 paddw m0, [pw_4]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
932 pxor m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
933 psraw m0, 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
934 movd [r1], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
935 psubw m1, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
936 packuswb m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
937 packuswb m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
938 punpcklbw m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
939 punpcklbw m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
940 punpcklwd m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
941 punpcklwd m1, m1
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
942
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
943 ; add DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
944 lea r1, [r0+r2*2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
945 ADD_DC m0, m1, 0, movh
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
946 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
947
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
948 INIT_XMM
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
949 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
950 ; load data
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
951 movd m0, [r1]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
952 pxor m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
953
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
954 ; calculate DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
955 paddw m0, [pw_4]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
956 movd [r1], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
957 lea r1, [r0+r2*2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
958 movd m2, [r0]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
959 movd m3, [r0+r2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
960 movd m4, [r1]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
961 movd m5, [r1+r2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
962 psraw m0, 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
963 pshuflw m0, m0, 0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
964 punpcklqdq m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
965 punpckldq m2, m3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
966 punpckldq m4, m5
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
967 punpcklbw m2, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
968 punpcklbw m4, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
969 paddw m2, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
970 paddw m4, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
971 packuswb m2, m4
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
972 movd [r0], m2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
973 pextrd [r0+r2], m2, 1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
974 pextrd [r1], m2, 2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
975 pextrd [r1+r2], m2, 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
976 RET
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
977
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
978 ;-----------------------------------------------------------------------------
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
980 ;-----------------------------------------------------------------------------
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
981
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
982 INIT_MMX
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
983 cglobal vp8_idct_dc_add4y_mmx, 3, 3
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
984 ; load data
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
985 movd m0, [r1+32*0] ; A
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
986 movd m1, [r1+32*2] ; C
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
987 punpcklwd m0, [r1+32*1] ; A B
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
988 punpcklwd m1, [r1+32*3] ; C D
12239
13b1ad24a4b1 VP8 asm: cosmetics (spacing)
darkshikari
parents: 12238
diff changeset
989 punpckldq m0, m1 ; A B C D
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
990 pxor m6, m6
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
991
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
992 ; calculate DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
993 paddw m0, [pw_4]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
994 movd [r1+32*0], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
995 movd [r1+32*1], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
996 movd [r1+32*2], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
997 movd [r1+32*3], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
998 psraw m0, 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
999 psubw m6, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1000 packuswb m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1001 packuswb m6, m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1002 punpcklbw m0, m0 ; AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1003 punpcklbw m6, m6 ; AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1004 movq m1, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1005 movq m7, m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1006 punpcklbw m0, m0 ; AAAABBBB
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1007 punpckhbw m1, m1 ; CCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1008 punpcklbw m6, m6 ; AAAABBBB
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1009 punpckhbw m7, m7 ; CCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1010
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1011 ; add DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1012 lea r1, [r0+r2*2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1013 ADD_DC m0, m6, 0, mova
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1014 ADD_DC m1, m7, 8, mova
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1015 RET
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1016
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1017 INIT_XMM
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1018 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1019 ; load data
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1020 movd m0, [r1+32*0] ; A
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1021 movd m1, [r1+32*2] ; C
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1022 punpcklwd m0, [r1+32*1] ; A B
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1023 punpcklwd m1, [r1+32*3] ; C D
12239
13b1ad24a4b1 VP8 asm: cosmetics (spacing)
darkshikari
parents: 12238
diff changeset
1024 punpckldq m0, m1 ; A B C D
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1025 pxor m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1026
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1027 ; calculate DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1028 paddw m0, [pw_4]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1029 movd [r1+32*0], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1030 movd [r1+32*1], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1031 movd [r1+32*2], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1032 movd [r1+32*3], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1033 psraw m0, 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1034 psubw m1, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1035 packuswb m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1036 packuswb m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1037 punpcklbw m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1038 punpcklbw m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1039 punpcklbw m0, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1040 punpcklbw m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1041
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1042 ; add DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1043 lea r1, [r0+r2*2]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1044 ADD_DC m0, m1, 0, mova
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1045 RET
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1046
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1047 ;-----------------------------------------------------------------------------
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1048 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1049 ;-----------------------------------------------------------------------------
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1050
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1051 INIT_MMX
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1052 cglobal vp8_idct_dc_add4uv_mmx, 3, 3
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1053 ; load data
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1054 movd m0, [r1+32*0] ; A
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1055 movd m1, [r1+32*2] ; C
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1056 punpcklwd m0, [r1+32*1] ; A B
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1057 punpcklwd m1, [r1+32*3] ; C D
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1058 punpckldq m0, m1 ; A B C D
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1059 pxor m6, m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1060
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1061 ; calculate DC
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1062 paddw m0, [pw_4]
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1063 movd [r1+32*0], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1064 movd [r1+32*1], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1065 movd [r1+32*2], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1066 movd [r1+32*3], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1067 psraw m0, 3
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1068 psubw m6, m0
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1069 packuswb m0, m0
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1070 packuswb m6, m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1071 punpcklbw m0, m0 ; AABBCCDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1072 punpcklbw m6, m6 ; AABBCCDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1073 movq m1, m0
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1074 movq m7, m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1075 punpcklbw m0, m0 ; AAAABBBB
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1076 punpckhbw m1, m1 ; CCCCDDDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1077 punpcklbw m6, m6 ; AAAABBBB
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1078 punpckhbw m7, m7 ; CCCCDDDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1079
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1080 ; add DC
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1081 lea r1, [r0+r2*2]
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1082 ADD_DC m0, m6, 0, mova
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1083 lea r0, [r0+r2*4]
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1084 lea r1, [r1+r2*4]
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1085 ADD_DC m1, m7, 0, mova
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1086 RET
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1087
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1088 ;-----------------------------------------------------------------------------
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1089 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1090 ;-----------------------------------------------------------------------------
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1092 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1093 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1094 %macro VP8_MULTIPLY_SUMSUB 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1095 mova %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1096 mova %4, %2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1097 pmulhw %3, m6 ;20091(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1098 pmulhw %4, m6 ;20091(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1099 paddw %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1100 paddw %4, %2
12018
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
1101 paddw %1, %1
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
1102 paddw %2, %2
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1103 pmulhw %1, m7 ;35468(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1104 pmulhw %2, m7 ;35468(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1105 psubw %1, %4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1106 paddw %2, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1107 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1108
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1109 ; calculate x0=%1+%3; x1=%1-%3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1110 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1111 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1112 ; %5/%6 are temporary registers
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1113 ; we assume m6/m7 have constant words 20091/17734 loaded in them
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1114 %macro VP8_IDCT_TRANSFORM4x4_1D 6
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1115 SUMSUB_BA m%3, m%1, m%5 ;t0, t1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1116 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1117 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1118 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1119 SWAP %4, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1120 SWAP %4, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1121 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1122
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1123 INIT_MMX
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1124 %macro VP8_IDCT_ADD 1
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1125 cglobal vp8_idct_add_%1, 3, 3
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1126 ; load block data
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1127 movq m0, [r1+ 0]
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1128 movq m1, [r1+ 8]
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1129 movq m2, [r1+16]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1130 movq m3, [r1+24]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1131 movq m6, [pw_20091]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1132 movq m7, [pw_17734]
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1133 %ifidn %1, sse
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1134 xorps xmm0, xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1135 movaps [r1+ 0], xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1136 movaps [r1+16], xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1137 %else
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1138 pxor m4, m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1139 movq [r1+ 0], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1140 movq [r1+ 8], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1141 movq [r1+16], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1142 movq [r1+24], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1143 %endif
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1144
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1145 ; actual IDCT
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1146 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1147 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1148 paddw m0, [pw_4]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1149 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1150 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1151
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1152 ; store
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1153 pxor m4, m4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1154 lea r1, [r0+2*r2]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1155 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1156 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1157
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1158 RET
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1159 %endmacro
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1160
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1161 VP8_IDCT_ADD mmx
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1162 VP8_IDCT_ADD sse
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1163
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1164 ;-----------------------------------------------------------------------------
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1165 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1166 ;-----------------------------------------------------------------------------
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1167
12209
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1168 %macro SCATTER_WHT 3
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1169 movd r1d, m%1
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1170 movd r2d, m%2
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1171 mov [r0+2*16*(0+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1172 mov [r0+2*16*(1+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1173 shr r1d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1174 shr r2d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1175 psrlq m%1, 32
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1176 psrlq m%2, 32
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1177 mov [r0+2*16*(4+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1178 mov [r0+2*16*(5+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1179 movd r1d, m%1
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1180 movd r2d, m%2
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1181 mov [r0+2*16*(8+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1182 mov [r0+2*16*(9+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1183 shr r1d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1184 shr r2d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1185 mov [r0+2*16*(12+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1186 mov [r0+2*16*(13+%3)], r2w
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1187 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1188
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1189 %macro HADAMARD4_1D 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1190 SUMSUB_BADC m%2, m%1, m%4, m%3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1191 SUMSUB_BADC m%4, m%2, m%3, m%1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1192 SWAP %1, %4, %3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1193 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1194
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1195 INIT_MMX
12209
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1196 cglobal vp8_luma_dc_wht_mmx, 2,3
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1197 movq m0, [r1]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1198 movq m1, [r1+8]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1199 movq m2, [r1+16]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1200 movq m3, [r1+24]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1201 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1202 TRANSPOSE4x4W 0, 1, 2, 3, 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1203 paddw m0, [pw_3]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1204 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1205 psraw m0, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1206 psraw m1, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1207 psraw m2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1208 psraw m3, 3
12209
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1209 SCATTER_WHT 0, 1, 0
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1210 SCATTER_WHT 2, 3, 2
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1211 RET
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1212
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1213 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1214 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1215 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1216
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1217 ; macro called with 7 mm register indexes as argument, and 4 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1218 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1219 ; first 4 mm registers will carry the transposed pixel data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1220 ; the other three are scratchspace (one would be sufficient, but this allows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1221 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1222 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1223 ; first two regular registers are buf+4*stride and buf+5*stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1224 ; third is -stride, fourth is +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1225 %macro READ_8x4_INTERLEAVED 11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1226 ; interleave 8 (A-H) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1227 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1228 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1229 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1230 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1231 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1232 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1233 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1234 punpcklbw m%1, m%5 ; A/B interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1235 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1236 punpcklbw m%2, m%6 ; C/D interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1237 punpcklbw m%3, m%7 ; E/F interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1238 punpcklbw m%4, m%5 ; G/H interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1239 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1240
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1241 ; macro called with 7 mm register indexes as argument, and 5 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1242 ; first 11 mean the same as READ_8x4_TRANSPOSED above
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1243 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1244 ; will be set to second regular register + 8*stride at the end
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1245 %macro READ_16x4_INTERLEAVED 12
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1246 ; transpose 16 (A-P) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1247 lea %12, [r0+8*r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1248
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1249 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1250 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1251 movd m%3, [%12+%10*4] ; I0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1252 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1253 movd m%4, [%12+%10*2] ; K0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1254 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1255 movd m%5, [%12+%10] ; L0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1256 movd m%7, [%12] ; M0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1257 add %12, %11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1258 punpcklbw m%1, m%3 ; A/I
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1259 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1260 punpcklbw m%2, m%4 ; C/K
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1261 punpcklbw m%6, m%5 ; D/L
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1262 punpcklbw m%3, m%7 ; E/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1263 punpcklbw m%2, m%6 ; C/D/K/L interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1264
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1265 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1266 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1267 movd m%4, [%12+%10*4] ; J0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1268 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1269 movd m%6, [%12] ; N0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1270 punpcklbw m%5, m%4 ; B/J
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1271 punpcklbw m%7, m%6 ; F/N
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1272 punpcklbw m%1, m%5 ; A/B/I/J interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1273 punpcklbw m%3, m%7 ; E/F/M/N interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1274 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1275 movd m%6, [%12+%11] ; O0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1276 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1277 movd m%7, [%12+%11*2] ; P0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1278 punpcklbw m%4, m%6 ; G/O
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1279 punpcklbw m%5, m%7 ; H/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1280 punpcklbw m%4, m%5 ; G/H/O/P interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1281 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1282
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1283 ; write 4 mm registers of 2 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1284 ; first four arguments are mm register indexes containing source data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1285 ; last four are registers containing buf+4*stride, buf+5*stride,
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1286 ; -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1287 %macro WRITE_4x2D 8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1288 ; write out (2 dwords per register)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1289 movd [%5+%7*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1290 movd [%5+%7*2], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1291 movd [%5], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1292 movd [%6+%8], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1293 punpckhdq m%1, m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1294 punpckhdq m%2, m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1295 punpckhdq m%3, m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1296 punpckhdq m%4, m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1297 movd [%6+%7*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1298 movd [%5+%7], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1299 movd [%6], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1300 movd [%6+%8*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1301 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1302
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1303 ; write 4 xmm registers of 4 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1304 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1305 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1306 ; we add 1*stride to the third regular registry in the process
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1307 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1308 ; same memory region), or 8 if they cover two separate buffers (third one points to
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1309 ; a different memory region than the first two), allowing for more optimal code for
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1310 ; the 16-width case
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1311 %macro WRITE_4x4D 10
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1312 ; write out (4 dwords per register), start with dwords zero
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1313 movd [%5+%8*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1314 movd [%5], m%2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1315 movd [%7+%8*4], m%3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1316 movd [%7], m%4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1317
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1318 ; store dwords 1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1319 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1320 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1321 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1322 psrldq m%4, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1323 movd [%6+%8*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1324 movd [%6], m%2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1325 %if %10 == 16
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1326 movd [%6+%9*4], m%3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1327 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1328 movd [%7+%9], m%4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1329
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1330 ; write dwords 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1331 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1332 psrldq m%2, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1333 %if %10 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1334 movd [%5+%8*2], m%1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1335 movd %5, m%3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1336 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1337 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1338 psrldq m%4, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1339 %if %10 == 16
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1340 movd [%5+%8*2], m%1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1341 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1342 movd [%6+%9], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1343 movd [%7+%8*2], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1344 movd [%7+%9*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1345 add %7, %9
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1346
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1347 ; store dwords 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1348 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1349 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1350 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1351 psrldq m%4, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1352 %if %10 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1353 mov [%7+%8*4], %5d
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1354 movd [%6+%8*2], m%1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1355 %else
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1356 movd [%5+%8], m%1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1357 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1358 movd [%6+%9*2], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1359 movd [%7+%8*2], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1360 movd [%7+%9*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1361 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1362
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1363 %macro SPLATB_REG_MMX 2-3
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1364 movd %1, %2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1365 punpcklbw %1, %1
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1366 punpcklwd %1, %1
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1367 punpckldq %1, %1
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1368 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1369
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1370 %macro SPLATB_REG_MMXEXT 2-3
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1371 movd %1, %2
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1372 punpcklbw %1, %1
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1373 pshufw %1, %1, 0x0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1374 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1375
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1376 %macro SPLATB_REG_SSE2 2-3
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1377 movd %1, %2
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1378 punpcklbw %1, %1
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1379 pshuflw %1, %1, 0x0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1380 punpcklqdq %1, %1
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1381 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1382
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1383 %macro SPLATB_REG_SSSE3 3
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1384 movd %1, %2
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1385 pshufb %1, %3
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1386 %endmacro
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1387
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1388 %macro SIMPLE_LOOPFILTER 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1389 cglobal vp8_%2_loop_filter_simple_%1, 3, %3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1390 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1391 mov r5, rsp ; backup stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1392 and rsp, ~(mmsize-1) ; align stack
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1393 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1394 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1395 mov r3, 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1396 %endif
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1397 %ifnidn %1, sse2 && mmsize == 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1398 pxor m0, m0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1399 %endif
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1400 SPLATB_REG m7, r2, m0 ; splat "flim" into register
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1401
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1402 ; set up indexes to address 4 rows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1403 mov r2, r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1404 neg r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1405 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1406 lea r0, [r0+4*r2-2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1407 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1408 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1409
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1410 %if mmsize == 8 ; mmx / mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1411 .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1412 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1413 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1414 ; read 4 half/full rows of pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1415 mova m0, [r0+r1*2] ; p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1416 mova m1, [r0+r1] ; p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1417 mova m2, [r0] ; q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1418 mova m3, [r0+r2] ; q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1419 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1420 lea r4, [r0+r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1421
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1422 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1423 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1424 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1425 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1426 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1427 TRANSPOSE4x4W 0, 1, 2, 3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1428
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1429 mova [rsp], m0 ; store p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1430 mova [rsp+mmsize], m3 ; store q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1431 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1432
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1433 ; simple_limit
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1434 mova m5, m2 ; m5=backup of q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1435 mova m6, m1 ; m6=backup of p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1436 psubusb m1, m2 ; p0-q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1437 psubusb m2, m6 ; q0-p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1438 por m1, m2 ; FFABS(p0-q0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1439 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1440
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1441 mova m4, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1442 mova m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1443 psubusb m3, m0 ; q1-p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1444 psubusb m0, m4 ; p1-q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1445 por m3, m0 ; FFABS(p1-q1)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1446 mova m0, [pb_80]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1447 pxor m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1448 pxor m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1449 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1450 pand m3, [pb_FE]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1451 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1452 paddusb m3, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1453 psubusb m3, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1454 pxor m1, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1455 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1456
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1457 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1458 mova m4, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1459 pxor m5, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1460 pxor m0, m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1461 psubsb m5, m0 ; q0-p0 (signed)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1462 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1463 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1464 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1465 pand m2, m3 ; apply filter mask (m3)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1466
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1467 mova m3, [pb_F8]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1468 mova m1, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1469 paddsb m2, [pb_4] ; f1<<3=a+4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1470 paddsb m1, [pb_3] ; f2<<3=a+3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1471 pand m2, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1472 pand m1, m3 ; cache f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1473
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1474 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1475 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1476 pcmpgtb m0, m2 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1477 psubb m3, m2 ; -f1<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1478 psrlq m2, 3 ; +f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1479 psrlq m3, 3 ; -f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1480 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1481 pandn m0, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1482 psubusb m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1483 paddusb m4, m3 ; q0-f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1484
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1485 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1486 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1487 pcmpgtb m0, m1 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1488 psubb m3, m1 ; -f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1489 psrlq m1, 3 ; +f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1490 psrlq m3, 3 ; -f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1491 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1492 pandn m0, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1493 paddusb m6, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1494 psubusb m6, m3 ; p0+f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1495
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1496 ; store
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1497 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1498 mova [r0], m4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1499 mova [r0+r1], m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1500 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1501 mova m0, [rsp] ; p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1502 SWAP 2, 4 ; p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1503 SWAP 1, 6 ; q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1504 mova m3, [rsp+mmsize] ; q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1505
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1506 TRANSPOSE4x4B 0, 1, 2, 3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1507 %if mmsize == 16 ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1508 add r3, r1 ; change from r4*8*stride to r0+8*stride
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1509 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1510 %else ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1511 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1512 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1513 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1514
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1515 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1516 ; next 8 pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1517 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1518 add r0, 8 ; advance 8 cols = pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1519 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1520 lea r0, [r0+r2*8] ; advance 8 rows = lines
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1521 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1522 dec r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1523 jg .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1524 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1525 REP_RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1526 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1527 mov rsp, r5 ; restore stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1528 RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1529 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1530 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1531 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1532 mov rsp, r5 ; restore stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1533 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1534 RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1535 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1536 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1537
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1538 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1539 %define SPLATB_REG SPLATB_REG_MMX
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1540 SIMPLE_LOOPFILTER mmx, v, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1541 SIMPLE_LOOPFILTER mmx, h, 6
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1542 %define SPLATB_REG SPLATB_REG_MMXEXT
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1543 SIMPLE_LOOPFILTER mmxext, v, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1544 SIMPLE_LOOPFILTER mmxext, h, 6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1545 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1546 %define SPLATB_REG SPLATB_REG_SSE2
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1547 SIMPLE_LOOPFILTER sse2, v, 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1548 SIMPLE_LOOPFILTER sse2, h, 6
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1549 %define SPLATB_REG SPLATB_REG_SSSE3
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1550 SIMPLE_LOOPFILTER ssse3, v, 3
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1551 SIMPLE_LOOPFILTER ssse3, h, 6
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1552
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1553 ;-----------------------------------------------------------------------------
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1554 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1555 ; int flimE, int flimI, int hev_thr);
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1556 ;-----------------------------------------------------------------------------
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1557
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1558 %macro INNER_LOOPFILTER 5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1559 %if %4 == 8 ; chroma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1560 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1561 %define dst8_reg r1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1562 %define mstride_reg r2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1563 %define E_reg r3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1564 %define I_reg r4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1565 %define hev_thr_reg r5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1566 %else ; luma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1567 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1568 %define mstride_reg r1
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1569 %define E_reg r2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1570 %define I_reg r3
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1571 %define hev_thr_reg r4
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1572 %ifdef m8 ; x86-64, sse2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1573 %define dst8_reg r4
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1574 %elif mmsize == 16 ; x86-32, sse2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1575 %define dst8_reg r5
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1576 %else ; x86-32, mmx/mmxext
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1577 %define cnt_reg r5
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1578 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1579 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1580 %define dst_reg r0
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1581 %define stride_reg E_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1582 %define dst2_reg I_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1583 %ifndef m8
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1584 %define stack_reg hev_thr_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1585 %endif
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1586
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1587 %ifnidn %1, sse2 && mmsize == 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1588 pxor m7, m7
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1589 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1590
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1591 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1592 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1593 SPLATB_REG m0, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1594 SPLATB_REG m1, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1595 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1596
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1597 ; align stack
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1598 mov stack_reg, rsp ; backup stack pointer
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1599 and rsp, ~(mmsize-1) ; align stack
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1600 %ifidn %2, v
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1601 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1602 ; [3]=hev() result
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1603 %else ; h
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1604 sub rsp, mmsize * 5 ; extra storage space for transposes
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1605 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1606
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1607 %define flim_E [rsp]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1608 %define flim_I [rsp+mmsize]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1609 %define hev_thr [rsp+mmsize*2]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1610 %define mask_res [rsp+mmsize*3]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1611 %define p0backup [rsp+mmsize*3]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1612 %define q0backup [rsp+mmsize*4]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1613
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1614 mova flim_E, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1615 mova flim_I, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1616 mova hev_thr, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1617
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1618 %else ; sse2 on x86-64
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1619
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1620 %define flim_E m9
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1621 %define flim_I m10
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1622 %define hev_thr m11
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1623 %define mask_res m12
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1624 %define p0backup m12
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1625 %define q0backup m8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1626
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1627 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1628 SPLATB_REG flim_E, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1629 SPLATB_REG flim_I, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1630 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1631 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1632
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1633 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1634 mov cnt_reg, 2
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1635 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1636 mov stride_reg, mstride_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1637 neg mstride_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1638 %ifidn %2, h
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1639 lea dst_reg, [dst_reg + stride_reg*4-4]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1640 %if %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1641 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1642 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1643 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1644
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1645 %if mmsize == 8
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1646 .next8px
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1647 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1648 ; read
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1649 lea dst2_reg, [dst_reg + stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1650 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1651 %if %4 == 8 && mmsize == 16
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1652 %define movrow movh
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1653 %else
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1654 %define movrow mova
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1655 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1656 movrow m0, [dst_reg +mstride_reg*4] ; p3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1657 movrow m1, [dst2_reg+mstride_reg*4] ; p2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1658 movrow m2, [dst_reg +mstride_reg*2] ; p1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1659 movrow m5, [dst2_reg] ; q1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1660 movrow m6, [dst2_reg+ stride_reg] ; q2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1661 movrow m7, [dst2_reg+ stride_reg*2] ; q3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1662 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1663 movhps m0, [dst8_reg+mstride_reg*4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1664 movhps m2, [dst8_reg+mstride_reg*2]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1665 add dst8_reg, stride_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1666 movhps m1, [dst8_reg+mstride_reg*4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1667 movhps m5, [dst8_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1668 movhps m6, [dst8_reg+ stride_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1669 movhps m7, [dst8_reg+ stride_reg*2]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1670 add dst8_reg, mstride_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1671 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1672 %elif mmsize == 8 ; mmx/mmxext (h)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1673 ; read 8 rows of 8px each
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1674 movu m0, [dst_reg +mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1675 movu m1, [dst2_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1676 movu m2, [dst_reg +mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1677 movu m3, [dst_reg +mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1678 movu m4, [dst_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1679 movu m5, [dst2_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1680 movu m6, [dst2_reg+ stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1681
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1682 ; 8x8 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1683 TRANSPOSE4x4B 0, 1, 2, 3, 7
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1684 mova q0backup, m1
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1685 movu m7, [dst2_reg+ stride_reg*2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1686 TRANSPOSE4x4B 4, 5, 6, 7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1687 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1688 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1689 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1690 mova m1, q0backup
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1691 mova q0backup, m2 ; store q0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1692 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1693 mova p0backup, m5 ; store p0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1694 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1695 SWAP 2, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1696 SWAP 6, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1697 SWAP 5, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1698 %else ; sse2 (h)
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1699 %if %4 == 16
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1700 lea dst8_reg, [dst_reg + stride_reg*8]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1701 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1702
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1703 ; read 16 rows of 8px each, interleave
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1704 movh m0, [dst_reg +mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1705 movh m1, [dst8_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1706 movh m2, [dst_reg +mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1707 movh m5, [dst8_reg+mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1708 movh m3, [dst_reg +mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1709 movh m6, [dst8_reg+mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1710 movh m4, [dst_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1711 movh m7, [dst8_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1712 punpcklbw m0, m1 ; A/I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1713 punpcklbw m2, m5 ; C/K
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1714 punpcklbw m3, m6 ; D/L
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1715 punpcklbw m4, m7 ; E/M
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1716
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1717 add dst8_reg, stride_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1718 movh m1, [dst2_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1719 movh m6, [dst8_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1720 movh m5, [dst2_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1721 movh m7, [dst8_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1722 punpcklbw m1, m6 ; B/J
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1723 punpcklbw m5, m7 ; F/N
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1724 movh m6, [dst2_reg+ stride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1725 movh m7, [dst8_reg+ stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1726 punpcklbw m6, m7 ; G/O
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1727
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1728 ; 8x16 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1729 TRANSPOSE4x4B 0, 1, 2, 3, 7
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1730 %ifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1731 SWAP 1, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1732 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1733 mova q0backup, m1
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1734 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1735 movh m7, [dst2_reg+ stride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1736 movh m1, [dst8_reg+ stride_reg*2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1737 punpcklbw m7, m1 ; H/P
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1738 TRANSPOSE4x4B 4, 5, 6, 7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1739 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1740 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1741 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1742 %ifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1743 SWAP 1, 8
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1744 SWAP 2, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1745 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1746 mova m1, q0backup
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1747 mova q0backup, m2 ; store q0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1748 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1749 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1750 %ifdef m12
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1751 SWAP 5, 12
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1752 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1753 mova p0backup, m5 ; store p0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1754 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1755 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1756 SWAP 2, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1757 SWAP 6, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1758 SWAP 5, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1759 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1760
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1761 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1762 mova m4, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1763 SWAP 4, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1764 psubusb m4, m0 ; p2-p3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1765 psubusb m0, m1 ; p3-p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1766 por m0, m4 ; abs(p3-p2)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1767
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1768 mova m4, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1769 SWAP 4, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1770 psubusb m4, m1 ; p1-p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1771 psubusb m1, m2 ; p2-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1772 por m1, m4 ; abs(p2-p1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1773
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1774 mova m4, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1775 SWAP 4, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1776 psubusb m4, m7 ; q2-q3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1777 psubusb m7, m6 ; q3-q2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1778 por m7, m4 ; abs(q3-q2)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1779
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1780 mova m4, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1781 SWAP 4, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1782 psubusb m4, m6 ; q1-q2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1783 psubusb m6, m5 ; q2-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1784 por m6, m4 ; abs(q2-q1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1785
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1786 %ifidn %1, mmx
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1787 mova m4, flim_I
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1788 pxor m3, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1789 psubusb m0, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1790 psubusb m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1791 psubusb m7, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1792 psubusb m6, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1793 pcmpeqb m0, m3 ; abs(p3-p2) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1794 pcmpeqb m1, m3 ; abs(p2-p1) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1795 pcmpeqb m7, m3 ; abs(q3-q2) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1796 pcmpeqb m6, m3 ; abs(q2-q1) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1797 pand m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1798 pand m7, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1799 pand m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1800 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1801 pmaxub m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1802 pmaxub m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1803 pmaxub m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1804 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1805
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1806 ; normal_limit and high_edge_variance for p1-p0, q1-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1807 SWAP 7, 3 ; now m7 is zero
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1808 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1809 movrow m3, [dst_reg +mstride_reg] ; p0
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1810 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1811 movhps m3, [dst8_reg+mstride_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1812 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1813 %elifdef m12
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1814 SWAP 3, 12
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1815 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1816 mova m3, p0backup
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1817 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1818
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1819 mova m1, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1820 SWAP 1, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1821 mova m6, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1822 SWAP 3, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1823 psubusb m1, m3 ; p1-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1824 psubusb m6, m2 ; p0-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1825 por m1, m6 ; abs(p1-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1826 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1827 mova m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1828 psubusb m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1829 psubusb m6, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1830 pcmpeqb m1, m7 ; abs(p1-p0) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1831 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1832 pand m0, m1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1833 mova mask_res, m6
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1834 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1835 pmaxub m0, m1 ; max_I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1836 SWAP 1, 4 ; max_hev_thresh
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1837 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1838
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1839 SWAP 6, 4 ; now m6 is I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1840 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1841 movrow m4, [dst_reg] ; q0
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1842 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1843 movhps m4, [dst8_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1844 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1845 %elifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1846 SWAP 4, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1847 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1848 mova m4, q0backup
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1849 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1850 mova m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1851 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1852 mova m7, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1853 SWAP 7, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1854 psubusb m1, m5 ; q0-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1855 psubusb m7, m4 ; q1-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1856 por m1, m7 ; abs(q1-q0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1857 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1858 mova m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1859 psubusb m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1860 psubusb m7, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1861 pxor m6, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1862 pcmpeqb m1, m6 ; abs(q1-q0) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1863 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1864 mova m6, mask_res
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1865 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1866 pand m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1867 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1868 pxor m7, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1869 pmaxub m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1870 pmaxub m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1871 psubusb m0, flim_I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1872 psubusb m6, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1873 pcmpeqb m0, m7 ; max(abs(..)) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1874 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1875 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1876 %ifdef m12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1877 SWAP 6, 12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1878 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1879 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1880 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1881
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1882 ; simple_limit
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1883 mova m1, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1884 SWAP 1, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1885 mova m6, m4 ; keep copies of p0/q0 around for later use
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1886 SWAP 6, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1887 psubusb m1, m4 ; p0-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1888 psubusb m6, m3 ; q0-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1889 por m1, m6 ; abs(q0-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1890 paddusb m1, m1 ; m1=2*abs(q0-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1891
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1892 mova m7, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1893 SWAP 7, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1894 mova m6, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1895 SWAP 6, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1896 psubusb m7, m5 ; p1-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1897 psubusb m6, m2 ; q1-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1898 por m7, m6 ; abs(q1-p1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1899 pxor m6, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1900 pand m7, [pb_FE]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1901 psrlq m7, 1 ; abs(q1-p1)/2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1902 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1903 psubusb m7, flim_E
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1904 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1905 pand m0, m7 ; normal_limit result
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1906
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1907 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1908 %ifdef m8 ; x86-64 && sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1909 mova m8, [pb_80]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1910 %define pb_80_var m8
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1911 %else ; x86-32 or mmx/mmxext
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1912 %define pb_80_var [pb_80]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1913 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1914 mova m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1915 mova m7, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1916 pxor m1, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1917 pxor m7, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1918 psubsb m1, m7 ; (signed) q0-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1919 mova m6, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1920 mova m7, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1921 pxor m6, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1922 pxor m7, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1923 psubsb m6, m7 ; (signed) p1-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1924 mova m7, mask_res
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1925 pandn m7, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1926 paddsb m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1927 paddsb m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1928 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1929
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1930 pand m7, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1931 mova m1, [pb_F8]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1932 mova m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1933 paddsb m7, [pb_3]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1934 paddsb m6, [pb_4]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1935 pand m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1936 pand m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1937
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1938 pxor m1, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1939 pxor m0, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1940 pcmpgtb m1, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1941 psubb m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1942 psrlq m7, 3 ; +f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1943 psrlq m0, 3 ; -f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1944 pand m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1945 pandn m1, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1946 psubusb m3, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1947 paddusb m3, m1 ; p0+f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1948
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1949 pxor m1, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1950 pxor m0, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1951 pcmpgtb m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1952 psubb m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1953 psrlq m6, 3 ; +f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1954 psrlq m1, 3 ; -f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1955 pand m1, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1956 pandn m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1957 psubusb m4, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1958 paddusb m4, m1 ; q0-f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1959
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1960 %ifdef m12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1961 SWAP 6, 12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1962 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1963 mova m6, mask_res
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1964 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1965 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1966 mova m7, [pb_1]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1967 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1968 pxor m7, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1969 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1970 pand m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1971 pand m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1972 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1973 paddusb m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1974 pand m1, [pb_FE]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1975 pandn m7, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1976 psrlq m1, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1977 psrlq m7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1978 SWAP 0, 7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1979 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1980 psubusb m1, [pb_1]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1981 pavgb m0, m7 ; a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1982 pavgb m1, m7 ; -a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1983 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1984 psubusb m5, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1985 psubusb m2, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1986 paddusb m5, m1 ; q1-a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1987 paddusb m2, m0 ; p1+a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1988
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1989 ; store
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1990 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1991 movrow [dst_reg +mstride_reg*2], m2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1992 movrow [dst_reg +mstride_reg ], m3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1993 movrow [dst_reg], m4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1994 movrow [dst_reg + stride_reg ], m5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1995 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1996 movhps [dst8_reg+mstride_reg*2], m2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1997 movhps [dst8_reg+mstride_reg ], m3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1998 movhps [dst8_reg], m4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1999 movhps [dst8_reg+ stride_reg ], m5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2000 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2001 %else ; h
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2002 add dst_reg, 2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2003 add dst2_reg, 2
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2004
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2005 ; 4x8/16 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2006 TRANSPOSE4x4B 2, 3, 4, 5, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2007
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2008 %if mmsize == 8 ; mmx/mmxext (h)
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2009 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2010 %else ; sse2 (h)
12180
b24153464669 Attempt to fix x86-64 testsuite on fate.
rbultje
parents: 12177
diff changeset
2011 lea dst8_reg, [dst8_reg+mstride_reg+2]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2012 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2013 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2014 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2015
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2016 %if mmsize == 8
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2017 %if %4 == 8 ; chroma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2018 %ifidn %2, h
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2019 sub dst_reg, 2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2020 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2021 cmp dst_reg, dst8_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2022 mov dst_reg, dst8_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2023 jnz .next8px
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2024 %else
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2025 %ifidn %2, h
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2026 lea dst_reg, [dst_reg + stride_reg*8-2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2027 %else ; v
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2028 add dst_reg, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2029 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2030 dec cnt_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2031 jg .next8px
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2032 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2033 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2034
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2035 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2036 mov rsp, stack_reg ; restore stack pointer
12173
c47ddb7df424 Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents: 12168
diff changeset
2037 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2038 RET
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2039 %endmacro
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2040
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2041 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2042 %define SPLATB_REG SPLATB_REG_MMX
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2043 INNER_LOOPFILTER mmx, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2044 INNER_LOOPFILTER mmx, h, 6, 16, 0
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2045 INNER_LOOPFILTER mmx, v, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2046 INNER_LOOPFILTER mmx, h, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2047
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2048 %define SPLATB_REG SPLATB_REG_MMXEXT
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2049 INNER_LOOPFILTER mmxext, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2050 INNER_LOOPFILTER mmxext, h, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2051 INNER_LOOPFILTER mmxext, v, 6, 8, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2052 INNER_LOOPFILTER mmxext, h, 6, 8, 0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2053
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2054 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2055 %define SPLATB_REG SPLATB_REG_SSE2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2056 INNER_LOOPFILTER sse2, v, 5, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2057 %ifdef m8
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2058 INNER_LOOPFILTER sse2, h, 5, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2059 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2060 INNER_LOOPFILTER sse2, h, 6, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2061 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2062 INNER_LOOPFILTER sse2, v, 6, 8, 13
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2063 INNER_LOOPFILTER sse2, h, 6, 8, 13
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2064
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2065 %define SPLATB_REG SPLATB_REG_SSSE3
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2066 INNER_LOOPFILTER ssse3, v, 5, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2067 %ifdef m8
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2068 INNER_LOOPFILTER ssse3, h, 5, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2069 %else
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2070 INNER_LOOPFILTER ssse3, h, 6, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2071 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2072 INNER_LOOPFILTER ssse3, v, 6, 8, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2073 INNER_LOOPFILTER ssse3, h, 6, 8, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2074
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2075 ;-----------------------------------------------------------------------------
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2076 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2077 ; int flimE, int flimI, int hev_thr);
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2078 ;-----------------------------------------------------------------------------
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2079
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2080 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2081 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2082 ; for pre-SSE4:
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2083 ; 3 is a general-purpose register that we will clobber
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2084 ; for SSE4:
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2085 ; 3 is a pointer to the destination's 5th line
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2086 ; 4 is a pointer to the destination's 4th line
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2087 ; 5/6 is -stride and +stride
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2088 ; 7 is optimization string
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2089 %macro WRITE_8W 7
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2090 %ifidn %7, sse4
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2091 pextrw [%4+%5*4], %1, 0
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2092 pextrw [%3+%5*4], %1, 1
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2093 pextrw [%4+%5*2], %1, 2
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2094 pextrw [%4+%5 ], %1, 3
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2095 pextrw [%4 ], %1, 4
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2096 pextrw [%3 ], %1, 5
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2097 pextrw [%3+%6 ], %1, 6
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2098 pextrw [%3+%6*2], %1, 7
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2099 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2100 movd %3, %1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2101 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2102 punpckhdq %1, %1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2103 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2104 psrldq %1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2105 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2106 mov [%4+%5*4], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2107 shr %3, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2108 add %4, %6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2109 mov [%4+%5*4], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2110
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2111 movd %3, %1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2112 %if mmsize == 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2113 psrldq %1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2114 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2115 add %4, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2116 mov [%4+%5*2], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2117 shr %3, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2118 mov [%4+%5 ], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2119
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2120 movd %3, %2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2121 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2122 punpckhdq %2, %2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2123 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2124 psrldq %2, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2125 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2126 mov [%4 ], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2127 shr %3, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2128 mov [%4+%6 ], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2129
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2130 movd %3, %2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2131 add %4, %6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2132 mov [%4+%6 ], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2133 shr %3, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2134 mov [%4+%6*2], %3w
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2135 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2136 add %4, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2137 %endif
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2138 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2139 %endmacro
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2140
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2141 %macro MBEDGE_LOOPFILTER 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2142 %if %4 == 8 ; chroma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2143 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2144 %define dst8_reg r1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2145 %define mstride_reg r2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2146 %define E_reg r3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2147 %define I_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2148 %define hev_thr_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2149 %else ; luma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2150 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2151 %define mstride_reg r1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2152 %define E_reg r2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2153 %define I_reg r3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2154 %define hev_thr_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2155 %ifdef m8 ; x86-64, sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2156 %define dst8_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2157 %elif mmsize == 16 ; x86-32, sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2158 %define dst8_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2159 %else ; x86-32, mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2160 %define cnt_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2161 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2162 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2163 %define dst_reg r0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2164 %define stride_reg E_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2165 %define dst2_reg I_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2166 %ifndef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2167 %define stack_reg hev_thr_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2168 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2169
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2170 %ifnidn %1, sse2 && mmsize == 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2171 pxor m7, m7
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2172 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2173
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2174 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2175 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2176 SPLATB_REG m0, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2177 SPLATB_REG m1, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2178 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2179
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2180 ; align stack
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2181 mov stack_reg, rsp ; backup stack pointer
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2182 and rsp, ~(mmsize-1) ; align stack
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2183 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2184 ; [3]=hev() result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2185 ; [4]=filter tmp result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2186 ; [5]/[6] = p2/q2 backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2187 ; [7]=lim_res sign result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2188
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2189 %define flim_E [rsp]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2190 %define flim_I [rsp+mmsize]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2191 %define hev_thr [rsp+mmsize*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2192 %define mask_res [rsp+mmsize*3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2193 %define lim_res [rsp+mmsize*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2194 %define p0backup [rsp+mmsize*3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2195 %define q0backup [rsp+mmsize*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2196 %define p2backup [rsp+mmsize*5]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2197 %define q2backup [rsp+mmsize*6]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2198 %define lim_sign [rsp+mmsize*7]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2199
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2200 mova flim_E, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2201 mova flim_I, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2202 mova hev_thr, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2203
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2204 %else ; sse2 on x86-64
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2206 %define flim_E m9
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2207 %define flim_I m10
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2208 %define hev_thr m11
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2209 %define mask_res m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2210 %define lim_res m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2211 %define p0backup m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2212 %define q0backup m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2213 %define p2backup m13
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2214 %define q2backup m14
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2215 %define lim_sign m15
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2216
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2217 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2218 SPLATB_REG flim_E, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2219 SPLATB_REG flim_I, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2220 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2221 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2222
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2223 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2224 mov cnt_reg, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2225 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2226 mov stride_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2227 neg mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2228 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2229 lea dst_reg, [dst_reg + stride_reg*4-4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2230 %if %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2231 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2232 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2233 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2234
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2235 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2236 .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2237 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2238 ; read
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2239 lea dst2_reg, [dst_reg + stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2240 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2241 %if %4 == 8 && mmsize == 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2242 %define movrow movh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2243 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2244 %define movrow mova
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2245 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2246 movrow m0, [dst_reg +mstride_reg*4] ; p3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2247 movrow m1, [dst2_reg+mstride_reg*4] ; p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2248 movrow m2, [dst_reg +mstride_reg*2] ; p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2249 movrow m5, [dst2_reg] ; q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2250 movrow m6, [dst2_reg+ stride_reg] ; q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2251 movrow m7, [dst2_reg+ stride_reg*2] ; q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2252 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2253 movhps m0, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2254 movhps m2, [dst8_reg+mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2255 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2256 movhps m1, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2257 movhps m5, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2258 movhps m6, [dst8_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2259 movhps m7, [dst8_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2260 add dst8_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2261 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2262 %elif mmsize == 8 ; mmx/mmxext (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2263 ; read 8 rows of 8px each
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2264 movu m0, [dst_reg +mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2265 movu m1, [dst2_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2266 movu m2, [dst_reg +mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2267 movu m3, [dst_reg +mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2268 movu m4, [dst_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2269 movu m5, [dst2_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2270 movu m6, [dst2_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2271
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2272 ; 8x8 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2273 TRANSPOSE4x4B 0, 1, 2, 3, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2274 mova q0backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2275 movu m7, [dst2_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2276 TRANSPOSE4x4B 4, 5, 6, 7, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2277 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2278 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2279 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2280 mova m1, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2281 mova q0backup, m2 ; store q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2282 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2283 mova p0backup, m5 ; store p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2284 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2285 SWAP 2, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2286 SWAP 6, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2287 SWAP 5, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2288 %else ; sse2 (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2289 %if %4 == 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2290 lea dst8_reg, [dst_reg + stride_reg*8]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2291 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2292
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2293 ; read 16 rows of 8px each, interleave
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2294 movh m0, [dst_reg +mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2295 movh m1, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2296 movh m2, [dst_reg +mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2297 movh m5, [dst8_reg+mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2298 movh m3, [dst_reg +mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2299 movh m6, [dst8_reg+mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2300 movh m4, [dst_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2301 movh m7, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2302 punpcklbw m0, m1 ; A/I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2303 punpcklbw m2, m5 ; C/K
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2304 punpcklbw m3, m6 ; D/L
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2305 punpcklbw m4, m7 ; E/M
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2306
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2307 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2308 movh m1, [dst2_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2309 movh m6, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2310 movh m5, [dst2_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2311 movh m7, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2312 punpcklbw m1, m6 ; B/J
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2313 punpcklbw m5, m7 ; F/N
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2314 movh m6, [dst2_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2315 movh m7, [dst8_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2316 punpcklbw m6, m7 ; G/O
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2317
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2318 ; 8x16 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2319 TRANSPOSE4x4B 0, 1, 2, 3, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2320 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2321 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2322 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2323 mova q0backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2324 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2325 movh m7, [dst2_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2326 movh m1, [dst8_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2327 punpcklbw m7, m1 ; H/P
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2328 TRANSPOSE4x4B 4, 5, 6, 7, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2329 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2330 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2331 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2332 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2333 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2334 SWAP 2, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2335 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2336 mova m1, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2337 mova q0backup, m2 ; store q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2338 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2339 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2340 %ifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2341 SWAP 5, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2342 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2343 mova p0backup, m5 ; store p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2344 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2345 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2346 SWAP 2, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2347 SWAP 6, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2348 SWAP 5, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2349 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2350
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2351 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2352 mova m4, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2353 SWAP 4, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2354 psubusb m4, m0 ; p2-p3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2355 psubusb m0, m1 ; p3-p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2356 por m0, m4 ; abs(p3-p2)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2357
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2358 mova m4, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2359 SWAP 4, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2360 psubusb m4, m1 ; p1-p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2361 mova p2backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2362 psubusb m1, m2 ; p2-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2363 por m1, m4 ; abs(p2-p1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2364
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2365 mova m4, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2366 SWAP 4, 6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2367 psubusb m4, m7 ; q2-q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2368 psubusb m7, m6 ; q3-q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2369 por m7, m4 ; abs(q3-q2)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2370
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2371 mova m4, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2372 SWAP 4, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2373 psubusb m4, m6 ; q1-q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2374 mova q2backup, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2375 psubusb m6, m5 ; q2-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2376 por m6, m4 ; abs(q2-q1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2377
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2378 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2379 mova m4, flim_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2380 pxor m3, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2381 psubusb m0, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2382 psubusb m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2383 psubusb m7, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2384 psubusb m6, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2385 pcmpeqb m0, m3 ; abs(p3-p2) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2386 pcmpeqb m1, m3 ; abs(p2-p1) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2387 pcmpeqb m7, m3 ; abs(q3-q2) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2388 pcmpeqb m6, m3 ; abs(q2-q1) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2389 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2390 pand m7, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2391 pand m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2392 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2393 pmaxub m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2394 pmaxub m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2395 pmaxub m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2396 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2397
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2398 ; normal_limit and high_edge_variance for p1-p0, q1-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2399 SWAP 7, 3 ; now m7 is zero
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2400 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2401 movrow m3, [dst_reg +mstride_reg] ; p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2402 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2403 movhps m3, [dst8_reg+mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2404 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2405 %elifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2406 SWAP 3, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2407 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2408 mova m3, p0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2409 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2410
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2411 mova m1, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2412 SWAP 1, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2413 mova m6, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2414 SWAP 3, 6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2415 psubusb m1, m3 ; p1-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2416 psubusb m6, m2 ; p0-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2417 por m1, m6 ; abs(p1-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2418 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2419 mova m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2420 psubusb m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2421 psubusb m6, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2422 pcmpeqb m1, m7 ; abs(p1-p0) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2423 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2424 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2425 mova mask_res, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2426 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2427 pmaxub m0, m1 ; max_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2428 SWAP 1, 4 ; max_hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2429 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2430
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2431 SWAP 6, 4 ; now m6 is I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2432 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2433 movrow m4, [dst_reg] ; q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2434 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2435 movhps m4, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2436 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2437 %elifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2438 SWAP 4, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2439 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2440 mova m4, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2441 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2442 mova m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2443 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2444 mova m7, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2445 SWAP 7, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2446 psubusb m1, m5 ; q0-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2447 psubusb m7, m4 ; q1-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2448 por m1, m7 ; abs(q1-q0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2449 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2450 mova m7, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2451 psubusb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2452 psubusb m7, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2453 pxor m6, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2454 pcmpeqb m1, m6 ; abs(q1-q0) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2455 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2456 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2457 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2458 pand m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2459 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2460 pxor m7, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2461 pmaxub m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2462 pmaxub m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2463 psubusb m0, flim_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2464 psubusb m6, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2465 pcmpeqb m0, m7 ; max(abs(..)) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2466 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2467 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2468 %ifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2469 SWAP 6, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2470 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2471 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2472 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2473
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2474 ; simple_limit
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2475 mova m1, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2476 SWAP 1, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2477 mova m6, m4 ; keep copies of p0/q0 around for later use
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2478 SWAP 6, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2479 psubusb m1, m4 ; p0-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2480 psubusb m6, m3 ; q0-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2481 por m1, m6 ; abs(q0-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2482 paddusb m1, m1 ; m1=2*abs(q0-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2483
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2484 mova m7, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2485 SWAP 7, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2486 mova m6, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2487 SWAP 6, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2488 psubusb m7, m5 ; p1-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2489 psubusb m6, m2 ; q1-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2490 por m7, m6 ; abs(q1-p1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2491 pxor m6, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2492 pand m7, [pb_FE]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2493 psrlq m7, 1 ; abs(q1-p1)/2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2494 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2495 psubusb m7, flim_E
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2496 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2497 pand m0, m7 ; normal_limit result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2498
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2499 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2500 %ifdef m8 ; x86-64 && sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2501 mova m8, [pb_80]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2502 %define pb_80_var m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2503 %else ; x86-32 or mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2504 %define pb_80_var [pb_80]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2505 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2506 mova m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2507 mova m7, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2508 pxor m1, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2509 pxor m7, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2510 psubsb m1, m7 ; (signed) q0-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2511 mova m6, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2512 mova m7, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2513 pxor m6, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2514 pxor m7, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2515 psubsb m6, m7 ; (signed) p1-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2516 mova m7, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2517 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2518 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2519 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2520 pand m6, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2521 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2522 mova lim_res, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2523 pand lim_res, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2524 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2525 mova m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2526 pand m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2527 mova lim_res, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2528 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2529 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2530
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2531 mova m1, [pb_F8]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2532 mova m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2533 paddsb m7, [pb_3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2534 paddsb m6, [pb_4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2535 pand m7, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2536 pand m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2537
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2538 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2539 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2540 pcmpgtb m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2541 psubb m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2542 psrlq m7, 3 ; +f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2543 psrlq m0, 3 ; -f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2544 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2545 pandn m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2546 psubusb m3, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2547 paddusb m3, m1 ; p0+f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2548
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2549 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2550 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2551 pcmpgtb m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2552 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2553 psrlq m6, 3 ; +f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2554 psrlq m1, 3 ; -f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2555 pand m1, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2556 pandn m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2557 psubusb m4, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2558 paddusb m4, m1 ; q0-f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2559
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2560 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2561 mova m7, [pw_63]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2562 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2563 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2564 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2565 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2566 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2567 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2568 mova m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2569 pcmpgtb m0, m1 ; which are negative
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2570 punpcklbw m6, m0 ; signed byte->word
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2571 punpckhbw m1, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2572 mova lim_sign, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2573 mova mask_res, m6 ; backup for later in filter
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2574 mova lim_res, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2575 pmullw m6, [pw_27]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2576 pmullw m1, [pw_27]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2577 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2578 paddw m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2579 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2580 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2581 packsswb m6, m1 ; a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2582 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2583 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2584 pand m1, m0 ; -a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2585 pandn m0, m6 ; +a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2586 psubusb m3, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2587 paddusb m4, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2588 paddusb m3, m0 ; p0+a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2589 psubusb m4, m0 ; q0-a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2590
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2591 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2592 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2593 mova m0, lim_sign
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2594 pmullw m6, [pw_18]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2595 pmullw m1, [pw_18]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2596 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2597 paddw m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2598 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2599 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2600 packsswb m6, m1 ; a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2601 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2602 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2603 pand m1, m0 ; -a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2604 pandn m0, m6 ; +a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2605 psubusb m2, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2606 paddusb m5, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2607 paddusb m2, m0 ; p1+a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2608 psubusb m5, m0 ; q1-a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2609
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2610 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2611 SWAP 6, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2612 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2613 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2614 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2615 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2616 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2617 pmullw m6, [pw_9]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2618 pmullw m1, [pw_9]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2619 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2620 paddw m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2621 %ifdef m15
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2622 SWAP 7, 15
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2623 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2624 mova m7, lim_sign
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2625 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2626 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2627 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2628 packsswb m6, m1 ; a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2629 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2630 psubb m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2631 pand m0, m7 ; -a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2632 pandn m7, m6 ; +a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2633 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2634 SWAP 1, 13
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2635 SWAP 6, 14
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2636 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2637 mova m1, p2backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2638 mova m6, q2backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2639 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2640 psubusb m1, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2641 paddusb m6, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2642 paddusb m1, m7 ; p1+a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2643 psubusb m6, m7 ; q1-a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2644
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2645 ; store
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2646 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2647 movrow [dst2_reg+mstride_reg*4], m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2648 movrow [dst_reg +mstride_reg*2], m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2649 movrow [dst_reg +mstride_reg ], m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2650 movrow [dst_reg], m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2651 movrow [dst2_reg], m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2652 movrow [dst2_reg+ stride_reg ], m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2653 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2654 add dst8_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2655 movhps [dst8_reg+mstride_reg*2], m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2656 movhps [dst8_reg+mstride_reg ], m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2657 movhps [dst8_reg], m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2658 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2659 movhps [dst8_reg], m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2660 movhps [dst8_reg+ stride_reg ], m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2661 movhps [dst8_reg+ stride_reg*2], m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2662 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2663 %else ; h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2664 inc dst_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2665 inc dst2_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2666
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2667 ; 4x8/16 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2668 TRANSPOSE4x4B 1, 2, 3, 4, 0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2669 SBUTTERFLY bw, 5, 6, 0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2670
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2671 %if mmsize == 8 ; mmx/mmxext (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2672 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2673 add dst_reg, 4
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2674 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2675 %else ; sse2 (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2676 lea dst8_reg, [dst8_reg+mstride_reg+1]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2677 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
12214
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents: 12211
diff changeset
2678 lea dst_reg, [dst2_reg+mstride_reg+4]
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents: 12211
diff changeset
2679 lea dst8_reg, [dst8_reg+mstride_reg+4]
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2680 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2681 %ifidn %2, sse4
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2682 lea dst_reg, [dst8_reg+ stride_reg]
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2683 %endif
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2684 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2685 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2686 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2687
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2688 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2689 %if %4 == 8 ; chroma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2690 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2691 sub dst_reg, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2692 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2693 cmp dst_reg, dst8_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2694 mov dst_reg, dst8_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2695 jnz .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2696 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2697 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2698 lea dst_reg, [dst_reg + stride_reg*8-5]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2699 %else ; v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2700 add dst_reg, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2701 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2702 dec cnt_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2703 jg .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2704 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2705 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2706
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2707 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2708 mov rsp, stack_reg ; restore stack pointer
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2709 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2710 RET
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2711 %endmacro
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2712
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2713 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2714 %define SPLATB_REG SPLATB_REG_MMX
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2715 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2716 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2717 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2718 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2719
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2720 %define SPLATB_REG SPLATB_REG_MMXEXT
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2721 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2722 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2723 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2724 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2725
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2726 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2727 %define SPLATB_REG SPLATB_REG_SSE2
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2728 MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2729 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2730 MBEDGE_LOOPFILTER sse2, h, 5, 16, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2731 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2732 MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2733 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2734 MBEDGE_LOOPFILTER sse2, v, 6, 8, 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2735 MBEDGE_LOOPFILTER sse2, h, 6, 8, 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2736
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2737 %define SPLATB_REG SPLATB_REG_SSSE3
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2738 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2739 %ifdef m8
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2740 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2741 %else
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2742 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2743 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2744 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2745 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2746
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2747 %ifdef m8
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2748 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2749 %else
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2750 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2751 %endif
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2752 MBEDGE_LOOPFILTER sse4, h, 6, 8, 16