annotate x86/vp8dsp.asm @ 12492:58a960d6e34c libavcodec

Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now coded in asm instead of C, this is (depending on the function) up to 50% faster for cases where gcc didn't do a great job at looping. Since h264_idct_add8() is now faster than the manual loop setup in h264.c, in-asm idct calling can now be enabled for chroma as well (see r16207). For MMX, this is 5% faster. For SSE2 (which isn't done for chroma if h264.c does the looping), this makes it up to 50% faster. Speed gain overall is ~0.5-1.0%.
author rbultje
date Tue, 14 Sep 2010 13:36:26 +0000
parents 2982071047a2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
24 %include "x86util.asm"
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 fourtap_filter_hw_m: times 4 dw -6, 123 ; 4-tap coefficients as 16-bit words; each row = one tap pair repeated 4x (16 bytes), 2 rows per filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 sixtap_filter_hw_m: times 4 dw 2, -11 ; 6-tap coefficients as 16-bit words; each row = one tap pair repeated 4x (16 bytes), 3 rows per filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
47 fourtap_filter_hb_m: times 8 db -6, 123 ; 4-tap coefficients as signed bytes for pmaddubsw; each row = one tap pair repeated 8x (16 bytes), 2 rows per filter; SSSE3 code indexes it as base + (mx<<4) - 16
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
48 times 8 db 12, -1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
49 times 8 db -9, 93
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
50 times 8 db 50, -6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
51 times 8 db -6, 50
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
52 times 8 db 93, -9
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
53 times 8 db -1, 12
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
54 times 8 db 123, -6
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 sixtap_filter_hb_m: times 8 db 2, 1 ; 6-tap coefficients as signed bytes for pmaddubsw, 3 rows (48 bytes) per filter: row 0 = (tap0, tap5), row 1 = (tap1, tap2), row 2 = (tap3, tap4)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 fourtap_filter_v_m: times 8 dw -6 ; 4-tap coefficients, one tap per row broadcast to 8 words (16 bytes); 4 rows per filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 sixtap_filter_v_m: times 8 dw 2 ; 6-tap coefficients, one tap per row broadcast to 8 words (16 bytes); 6 rows per filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
101
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
102 bilinear_filter_vw_m: times 8 dw 1 ; bilinear weights 1..7, one weight per row broadcast to 8 words (16 bytes)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
103 times 8 dw 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
104 times 8 dw 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
105 times 8 dw 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
106 times 8 dw 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
107 times 8 dw 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
108 times 8 dw 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
109
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
110 bilinear_filter_vb_m: times 8 db 7, 1 ; bilinear weight pairs (8-k, k) for k = 1..7 as bytes, each pair repeated 8x (16 bytes per row)
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
111 times 8 db 6, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
112 times 8 db 5, 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
113 times 8 db 4, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
114 times 8 db 3, 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
115 times 8 db 2, 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
116 times 8 db 1, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
117
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
118 %ifdef PIC
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
119 %define fourtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
120 %define sixtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
121 %define fourtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
122 %define sixtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
123 %define fourtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
124 %define sixtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
125 %define bilinear_filter_vw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
126 %define bilinear_filter_vb r11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
133 %define sixtap_filter_v sixtap_filter_v_m
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
134 %define bilinear_filter_vw bilinear_filter_vw_m
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
135 %define bilinear_filter_vb bilinear_filter_vb_m
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
137
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 ; pshufb mask: gathers source byte pairs (i, i+1) for i = 0..7
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 ; pshufb mask: gathers source byte pairs (i+2, i+3) for i = 0..7
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
140
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 ; pshufb mask: gathers source byte pairs (i, i+5) for i = 0..7 -- the outermost taps of a 6-tap window
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 ; pshufb mask: gathers source byte pairs (i+1, i+2) for i = 0..7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 ; pshufb mask: gathers source byte pairs (i+3, i+4) for i = 0..7
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
145 pw_20091: times 4 dw 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
146 pw_17734: times 4 dw 17734
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
147
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
148 pb_27_63: times 8 db 27, 63
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
149 pb_18_63: times 8 db 18, 63
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
150 pb_9_63: times 8 db 9, 63
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
151
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
152 cextern pb_1
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
153 cextern pw_3
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
154 cextern pb_3
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
155 cextern pw_4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
156 cextern pb_4
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
157 cextern pw_9
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
158 cextern pw_18
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
159 cextern pw_27
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
160 cextern pw_63
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161 cextern pw_64
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
162 cextern pb_80
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
163 cextern pb_F8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
164 cextern pb_FE
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
167
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
168 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
169 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
170 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
171 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
172 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
173 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
174 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
175
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
176 %macro FILTER_SSSE3 3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
177 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
178 lea r5d, [r5*3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
179 mova m3, [filter_h6_shuf2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
180 mova m4, [filter_h6_shuf3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
181 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
182 lea r11, [sixtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
183 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
184 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
185 mova m6, [sixtap_filter_hb+r5*8-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
186 mova m7, [sixtap_filter_hb+r5*8-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
187
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
188 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
189 movu m0, [r2-2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
190 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
191 mova m2, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
192 %ifidn %1, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
193 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
194 ; shuffle with a memory operand
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
195 punpcklbw m0, [r2+3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
196 %else
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
197 pshufb m0, [filter_h6_shuf1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
198 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
199 pshufb m1, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
200 pshufb m2, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
201 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
202 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
203 pmaddubsw m2, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
204 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
205 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
206 paddsw m0, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
207 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
208 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
209 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
210
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
211 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
212 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
213 add r2, r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
214 dec r4d ; next row
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
215 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
216 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
217
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
219 shl r5d, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
220 mova m2, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
221 mova m3, [filter_h2_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
222 mova m4, [filter_h4_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
223 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
224 lea r11, [fourtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
225 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
226 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
227 mova m6, [fourtap_filter_hb+r5]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
228
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
229 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
230 movu m0, [r2-1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
231 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
232 pshufb m0, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
233 pshufb m1, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
234 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
235 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
236 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
237 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
238 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
239 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
240 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
241
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
242 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
243 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
244 add r2, r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
245 dec r4d ; next row
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
246 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
247 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
248
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 ; vertical 4-tap filter; %1/%2 are arguments of the enclosing macro
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
250 shl r6d, 4 ; r6 *= 16: byte offset into fourtap_filter_hb
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
251 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
252 lea r11, [fourtap_filter_hb_m] ; PIC only: table address in r11 (presumably what fourtap_filter_hb resolves against - its definition is not in view)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
253 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
254 mova m5, [fourtap_filter_hb+r6-16] ; coefficients applied to the older row pair
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
255 mova m6, [fourtap_filter_hb+r6] ; coefficients applied to the newer row pair
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
256 mova m7, [pw_64] ; rounding term added before the >>7 below
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
257
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
258 ; read 3 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
259 sub r2, r3 ; step the source pointer back one row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
260 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
261 movh m1, [r2+ r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
262 movh m2, [r2+2*r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
263 add r2, r3 ; undo the sub above: r2 is back on its entry row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
264
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
265 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
266 movh m3, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
267 mova m4, m0 ; work on a copy so m0 can be rotated
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
268 mova m0, m1 ; rotate row history: m0 <- m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
269 punpcklbw m4, m1 ; interleave bytes of the two oldest rows
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
270 mova m1, m2 ; rotate row history: m1 <- m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
271 punpcklbw m2, m3 ; interleave bytes of the two newest rows
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
272 pmaddubsw m4, m5 ; multiply byte pairs by coefficients, add adjacent products
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
273 pmaddubsw m2, m6 ; same for the newer row pair
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
274 paddsw m4, m2 ; saturating sum of both partial results
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
275 mova m2, m3 ; rotate row history: m2 <- row just read
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
276 paddsw m4, m7 ; rounding
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
277 psraw m4, 7 ; arithmetic >> 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
278 packuswb m4, m4 ; clip and word->bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
279 movh [r0], m4 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
280
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
281 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
282 add r0, r1 ; advance the store pointer by r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
283 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
284 dec r4d ; next row
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
285 jg .nextrow ; loop while the row counter in r4d is still > 0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
286 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
287
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 ; vertical 6-tap filter; %1/%2 are arguments of the enclosing macro
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
289 lea r6d, [r6*3] ; r6 *= 3 (scaled by another 8 below, 24 in total)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
290 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
291 lea r11, [sixtap_filter_hb_m] ; PIC only: table address in r11 (presumably what sixtap_filter_hb resolves against - its definition is not in view)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
292 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
293 lea r6, [sixtap_filter_hb+r6*8] ; r6 = coefficient pointer; the loop reads [r6-48], [r6-32], [r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
294
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
295 ; read 5 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
296 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
297 sub r2, r3 ; source pointer is now two rows above its entry row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
298 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
299 movh m1, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
300 movh m2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
301 lea r2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
302 add r2, r3 ; together with the lea: r2 advanced three rows (one row below its entry row)
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
303 movh m3, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
304 movh m4, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
305
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
306 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
307 movh m5, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
308 mova m6, m0 ; work on a copy so m0 can be rotated
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
309 punpcklbw m6, m5 ; interleave bytes of the oldest row with the newest row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
310 mova m0, m1 ; rotate row history: m0 <- m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
311 punpcklbw m1, m2 ; interleave bytes of the 2nd and 3rd rows
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
312 mova m7, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
313 punpcklbw m7, m4 ; interleave bytes of the 4th and 5th rows
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
314 pmaddubsw m6, [r6-48] ; multiply byte pairs by coefficients read straight from the table
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
315 pmaddubsw m1, [r6-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
316 pmaddubsw m7, [r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
317 paddsw m6, m1 ; saturating sum of the three partial results
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
318 paddsw m6, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
319 mova m1, m2 ; rotate row history: m1 <- m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
320 paddsw m6, [pw_64] ; rounding
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
321 mova m2, m3 ; rotate row history: m2 <- m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
322 psraw m6, 7 ; arithmetic >> 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
323 mova m3, m4 ; rotate row history: m3 <- m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
324 packuswb m6, m6 ; clip and word->bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
325 mova m4, m5 ; rotate row history: m4 <- row just read
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
326 movh [r0], m6 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
327
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
328 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
329 add r0, r1 ; advance the store pointer by r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
330 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
331 dec r4d ; next row
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
332 jg .nextrow ; loop while the row counter in r4d is still > 0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
333 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
334 %endmacro
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
335
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
336 INIT_MMX ; presumably maps the m# aliases to MMX registers (x86inc) - definition not in view
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
337 FILTER_SSSE3 4, 0, 0 ; expand the macro with %1=4 (used in the function names), %2=0, %3=0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
338 INIT_XMM ; presumably maps the m# aliases to XMM registers (x86inc) - definition not in view
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
339 FILTER_SSSE3 8, 8, 7 ; expand the macro again with %1=8, %2=8, %3=7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
340
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342 cglobal put_vp8_epel4_h4_mmxext, 6, 6 ; one 4-pixel output row per loop iteration
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 shl r5d, 4 ; r5 *= 16: byte offset into fourtap_filter_hw
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345 lea r11, [fourtap_filter_hw_m] ; PIC only: table address in r11 (presumably what fourtap_filter_hw resolves against - its definition is not in view)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 movq mm5, [fourtap_filter_hw+r5] ; second coefficient pair (F2/F3)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 movq mm7, [pw_64] ; rounding term added before the >>7 below
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350 pxor mm6, mm6 ; zero, used for byte->word unpacking and the final pack
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 psraw mm3, 7 ; arithmetic >> 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 add r0, r1 ; advance the store pointer by r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
384 dec r4d ; next row
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
385 jg .nextrow ; loop while the row counter in r4d is still > 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
389 cglobal put_vp8_epel4_h6_mmxext, 6, 6 ; one 4-pixel output row per loop iteration
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
390 lea r5d, [r5*3] ; r5 *= 3 (scaled by another 8 in the table loads below, 24 in total)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 lea r11, [sixtap_filter_hw_m] ; PIC only: table address in r11 (presumably what sixtap_filter_hw resolves against - its definition is not in view)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words (F0/F1)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395 movq mm5, [sixtap_filter_hw+r5*8-32] ; F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396 movq mm6, [sixtap_filter_hw+r5*8-16] ; F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 movq mm7, [pw_64] ; rounding term added before the >>7 below
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398 pxor mm3, mm3 ; zero, used for byte->word unpacking
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 pshufw mm3, mm0, 0x94 ; word CDDE (mm3 stops being zero here; it is cleared again below)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
415 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
419
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
420 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 pxor mm3, mm3 ; mm3 was used as a temporary above: re-zero it for the unpack/pack below and the next iteration
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
429 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
434 psraw mm1, 7 ; arithmetic >> 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439 add r0, r1 ; advance the store pointer by r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
441 dec r4d ; next row
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 jg .nextrow ; loop while the row counter in r4d is still > 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 INIT_XMM ; presumably maps the m# aliases to XMM registers for the functions below (x86inc) - definition not in view
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
446 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 ; horizontal 4-tap filter, one output row per loop iteration
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
447 shl r5d, 5 ; r5 *= 32: byte offset into fourtap_filter_v
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448 %ifdef PIC
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
449 lea r11, [fourtap_filter_v_m] ; PIC only: table address in r11 (presumably what fourtap_filter_v resolves against - its definition is not in view)
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 %endif
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
451 lea r5, [fourtap_filter_v+r5-32] ; r5 -> first of the four 16-byte coefficient vectors read below
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 pxor m7, m7 ; zero, used for byte->word unpacking and the final pack
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
453 mova m4, [pw_64] ; rounding term added before the >>7 below
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
454 mova m5, [r5+ 0] ; coefficients for the pixels loaded from [r2-1]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
455 mova m6, [r5+16] ; coefficients for the pixels loaded from [r2-0]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
456 %ifdef m8
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
457 mova m8, [r5+32] ; m8/m9 exist: keep the remaining two coefficient vectors in registers
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
458 mova m9, [r5+48]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
459 %endif
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 .nextrow
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
461 movq m0, [r2-1] ; four loads of 8 pixels each, at byte offsets -1..+2 (one per tap)
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
462 movq m1, [r2-0]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
463 movq m2, [r2+1]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
464 movq m3, [r2+2]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
465 punpcklbw m0, m7 ; byte->word
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
466 punpcklbw m1, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
467 punpcklbw m2, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
468 punpcklbw m3, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
469 pmullw m0, m5 ; multiply each shifted copy by its tap coefficients
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
470 pmullw m1, m6
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
471 %ifdef m8
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
472 pmullw m2, m8
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
473 pmullw m3, m9
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
474 %else
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
475 pmullw m2, [r5+32] ; no m8/m9: multiply straight from the table
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
476 pmullw m3, [r5+48]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
477 %endif
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
478 paddsw m0, m1 ; saturating sum of the four products
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
479 paddsw m2, m3
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
480 paddsw m0, m2
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
481 paddsw m0, m4 ; rounding
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482 psraw m0, 7 ; arithmetic >> 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 packuswb m0, m7 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 add r0, r1 ; advance the store pointer by r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
489 dec r4d ; next row
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490 jg .nextrow ; loop while the row counter in r4d is still > 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
493 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 ; horizontal 6-tap filter, one output row per loop iteration
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 lea r5d, [r5*3] ; r5 *= 3 ...
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
495 shl r5d, 4 ; ... then *= 16: r5 = 48 * original value, byte offset into sixtap_filter_v
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496 %ifdef PIC
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
497 lea r11, [sixtap_filter_v_m] ; PIC only: table address in r11 (presumably what sixtap_filter_v resolves against - its definition is not in view)
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 %endif
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
499 lea r5, [sixtap_filter_v+r5-96] ; r5 -> first of the six 16-byte coefficient vectors read below
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500 pxor m7, m7 ; zero, used for byte->word unpacking and the final pack
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
501 mova m6, [pw_64] ; rounding term added before the >>7 below
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
502 %ifdef m8
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
503 mova m8, [r5+ 0] ; m8..m13 exist: keep all six coefficient vectors in registers
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
504 mova m9, [r5+16]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
505 mova m10, [r5+32]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
506 mova m11, [r5+48]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
507 mova m12, [r5+64]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
508 mova m13, [r5+80]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
509 %endif
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510 .nextrow
12278
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
511 movq m0, [r2-2] ; six loads of 8 pixels each, at byte offsets -2..+3 (one per tap)
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
512 movq m1, [r2-1]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
513 movq m2, [r2-0]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
514 movq m3, [r2+1]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
515 movq m4, [r2+2]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
516 movq m5, [r2+3]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
517 punpcklbw m0, m7 ; byte->word
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
518 punpcklbw m1, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
519 punpcklbw m2, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
520 punpcklbw m3, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
521 punpcklbw m4, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
522 punpcklbw m5, m7
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
523 %ifdef m8
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
524 pmullw m0, m8 ; multiply each shifted copy by its tap coefficients
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
525 pmullw m1, m9
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
526 pmullw m2, m10
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
527 pmullw m3, m11
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
528 pmullw m4, m12
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
529 pmullw m5, m13
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
530 %else
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
531 pmullw m0, [r5+ 0] ; no m8..m13: multiply straight from the table
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
532 pmullw m1, [r5+16]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
533 pmullw m2, [r5+32]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
534 pmullw m3, [r5+48]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
535 pmullw m4, [r5+64]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
536 pmullw m5, [r5+80]
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
537 %endif
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
538 paddsw m1, m4 ; saturating sum of the six products, accumulated in two chains (m1 and m0)
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
539 paddsw m0, m5
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
540 paddsw m1, m2
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
541 paddsw m0, m3
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
542 paddsw m0, m1 ; combine the two chains
da5b503f050d VP8: Much faster SSE2 MC
darkshikari
parents: 12276
diff changeset
543 paddsw m0, m6 ; rounding
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 psraw m0, 7 ; arithmetic >> 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545 packuswb m0, m7 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 add r0, r1 ; advance the store pointer by r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550 add r2, r3 ; advance the load pointer by r3
12400
4f13b2ded34d Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents: 12340
diff changeset
551 dec r4d ; next row
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 jg .nextrow ; loop while the row counter in r4d is still > 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
;-----------------------------------------------------------------------------
; FILTER_V suffix, width, xmm_count
;   %1  suffix appended to the generated function names (mmxext / sse2)
;   %2  block width used in the generated function names (4 / 8)
;   %3  forwarded unchanged as the last cglobal argument
;
; Emits two vertical sub-pixel filters:
;   put_vp8_epel%2_v4_%1   4-tap, reads source rows y-1 .. y+2
;   put_vp8_epel%2_v6_%1   6-tap, reads source rows y-2 .. y+3
;
; Register usage shared by both functions (as visible in the bodies):
;   r0   destination pointer, advanced by r1 after each stored row
;   r2   source pointer, advanced by r3 after each row
;   r4d  remaining rows; the loop repeats while it is > 0 after the decrement
;   r6   filter index on entry, rewritten into a pointer to 16-byte
;        coefficient vectors
;        NOTE(review): presumably the vertical sub-pel position ("my") -
;        confirm against the C prototype.
;   m7   all zero; used to widen bytes to words and as the high half of the
;        final pack
;
; Every row is computed as (sum of row*coefficient + [pw_64]) >> 7 with
; saturating word additions, then packed to unsigned bytes.
; fourtap_filter_v / sixtap_filter_v / pw_64 are defined outside this chunk.
; NOTE(review): under PIC the table names presumably resolve through r11,
; which is why r11 is loaded first - confirm at their %define.
;-----------------------------------------------------------------------------
%macro FILTER_V 3
; %2-pixel wide block, vertical-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                         ; r6 = index * 32
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]  ; vectors are read at +0/+16/+32/+48
    mova      m6, [pw_64]                   ; rounding term, kept in a register
    pxor      m7, m7
    mova      m5, [r6+48]                   ; 4th tap is loop-invariant

    ; read 3 lines (rows y-1, y, y+1), then step r2 back to row y
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                 ; read new row (y+2)
    punpcklbw m4, m7
    mova      m3, m4                        ; keep an unscaled copy for the next row
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps; each mova slides the row window by one
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                            ; next row
    jg .nextrow
    REP_RET


; %2-pixel wide block, vertical-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                    ; r6 = index * 48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]   ; vectors are read at +0 .. +80
    pxor      m7, m7

    ; read 5 lines (rows y-2 .. y+2); r2 ends up pointing at row y+1
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps; each mova slides the row window by one
    movh      m5, [r2+2*r3]                 ; read new row (y+3)
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store (all eight registers are busy, so the rounding term
    ; comes from memory here)
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                            ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
678
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
;-----------------------------------------------------------------------------
; FILTER_BILINEAR suffix, width, xmm_count
;   %1  suffix appended to the generated function names (mmxext / sse2);
;       also selects the store sequence via %ifidn %1, mmxext
;   %2  block width used in the generated function names (4 / 8)
;   %3  forwarded unchanged as the last cglobal argument
;
; Emits put_vp8_bilinear%2_v_%1 (vertical) and put_vp8_bilinear%2_h_%1
; (horizontal). Both produce two output rows per loop iteration.
;
; Register usage (as visible in the bodies):
;   r0   destination pointer, advanced by 2*r1 per iteration
;   r2   source pointer, advanced by 2*r3 per iteration
;   r4d  remaining rows, decremented by 2; loop repeats while the result > 0
;   r5/r6  filter index and scratch (see each function)
;          NOTE(review): presumably "mx"/"my" of the C prototype - confirm.
;   m4   weight vector for the first sample  (table entry 8 - index)
;   m5   weight vector for the second sample (table entry index)
;   m6   all zero
;
; Table entries are 16 bytes apart and addressed as table + entry*16 - 16.
; NOTE(review): an index of 0 would read 16 bytes before bilinear_filter_vw
; for m5 - presumably callers never pass 0; confirm.
;
; Per pixel: t = a*m4 + b*m5 (saturating add); out = ((t >> 2) + 1) >> 1,
; where the final step is pavgw against zero; the result is packed to bytes.
; bilinear_filter_vw is defined outside this chunk.
; NOTE(review): under PIC it presumably resolves through r11 - confirm.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                         ; r6 = index * 16
    sub      r5d, r6d                       ; r5 = (8 - index) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]                 ; row y
    movh      m1, [r2+r3*1]                 ; row y+1
    movh      m3, [r2+r3*2]                 ; row y+2
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                        ; row y+1 is used by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; output row 0
    paddsw    m2, m3                        ; output row 1
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                        ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16                      ; r6 is overwritten and used as scratch here
    shl      r5d, 4                         ; r5 = index * 16
    sub      r6d, r5d                       ; r6 = (8 - index) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]               ; row y,   pixels x
    movh      m1, [r2+r3*0+1]               ; row y,   pixels x+1
    movh      m2, [r2+r3*1+0]               ; row y+1, pixels x
    movh      m3, [r2+r3*1+1]               ; row y+1, pixels x+1
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; output row 0
    paddsw    m2, m3                        ; output row 1
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2                        ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
776
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
;-----------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 width
;   %1  block width used in the generated function names (4 / 8)
;
; Emits put_vp8_bilinear%1_v_ssse3 and put_vp8_bilinear%1_h_ssse3. The register
; width is chosen by the INIT_MMX / INIT_XMM in effect at the point of
; instantiation; the store sequence is selected with %if mmsize==8.
; Both functions produce two output rows per loop iteration.
;
; Register usage (as visible in the bodies):
;   r0   destination pointer, advanced by 2*r1 per iteration
;   r2   source pointer, advanced by 2*r3 per iteration
;   r4d  remaining rows, decremented by 2; loop repeats while the result > 0
;   r5/r6  filter index (h uses r5, v uses r6), scaled by 16 to select a
;          16-byte entry at bilinear_filter_vb + index*16 - 16
;          NOTE(review): presumably "mx"/"my" of the C prototype - confirm.
;   m3   byte coefficient vector fed to pmaddubsw
;   m4   all zero
;
; pmaddubsw multiplies each unsigned sample byte by the matching signed
; coefficient byte and adds adjacent pairs, so one instruction yields the
; two-sample weighted sum t per pixel as a word. out = ((t >> 2) + 1) >> 1,
; where the final step is pavgw against zero; the result is packed to bytes.
; bilinear_filter_vb and filter_h2_shuf are defined outside this chunk.
; NOTE(review): under PIC bilinear_filter_vb presumably resolves through
; r11 - confirm at its %define.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                         ; r6 = index * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]                 ; row y
    movh      m1, [r2+r3*1]                 ; row y+1
    movh      m2, [r2+r3*2]                 ; row y+2
    punpcklbw m0, m1                        ; interleave rows y   / y+1 bytewise
    punpcklbw m1, m2                        ; interleave rows y+1 / y+2 bytewise
    pmaddubsw m0, m3                        ; output row 0
    pmaddubsw m1, m3                        ; output row 1
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1                        ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                         ; r5 = index * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]          ; NOTE(review): presumably pairs each pixel
                                            ; with its right neighbour - confirm at the table
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]                 ; unaligned full-register load, row y
    movu      m1, [r2+r3*1]                 ; unaligned full-register load, row y+1
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3                        ; output row 0
    pmaddubsw m1, m3                        ; output row 1
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1                        ; row 0 in the low half, row 1 in the high half
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
855
11992
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
; put_vp8_pixels8_mmx: straight copy of 8-byte-wide rows using MMX movq.
; Register roles as used by the code below:
;   r0 = destination pointer, r1 = destination row pitch
;   r2 = source pointer,      r3 = source row pitch
;   r4d = rows remaining
; The loop is bottom-tested and handles two rows per pass, so it always copies at
; least two rows and an odd row count is effectively rounded up to the next even one.
; NOTE(review): the C prototype is not visible in this listing; cglobal and REP_RET
; are presumably the x86inc.asm prologue/return macros -- confirm there.
856 cglobal put_vp8_pixels8_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
857 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
858 movq mm0, [r2+r3*0] ; source row 0 (8 bytes)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
859 movq mm1, [r2+r3*1] ; source row 1 (8 bytes)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
860 lea r2, [r2+r3*2] ; src += 2 rows (lea leaves flags untouched)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
861 movq [r0+r1*0], mm0 ; destination row 0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
862 movq [r0+r1*1], mm1 ; destination row 1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
863 lea r0, [r0+r1*2] ; dst += 2 rows
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
864 sub r4d, 2 ; two rows done; sets the flags tested by jg
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
865 jg .nextrow ; signed test: continue while rows remaining > 0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
866 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
867
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
; put_vp8_pixels16_mmx: straight copy of 16-byte-wide rows, done as two 8-byte MMX
; moves per row and two rows per loop pass.
; Register roles as used by the code below:
;   r0 = destination pointer, r1 = destination row pitch
;   r2 = source pointer,      r3 = source row pitch
;   r4d = rows remaining
; All four loads are issued before r2 is advanced and before any store.
; The loop is bottom-tested, so at least two rows are always copied.
; NOTE(review): the C prototype is not visible in this listing; cglobal and REP_RET
; are presumably the x86inc.asm prologue/return macros -- confirm there.
868 cglobal put_vp8_pixels16_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
869 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
870 movq mm0, [r2+r3*0+0] ; row 0, bytes 0-7
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
871 movq mm1, [r2+r3*0+8] ; row 0, bytes 8-15
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
872 movq mm2, [r2+r3*1+0] ; row 1, bytes 0-7
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
873 movq mm3, [r2+r3*1+8] ; row 1, bytes 8-15
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
874 lea r2, [r2+r3*2] ; src += 2 rows
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
875 movq [r0+r1*0+0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
876 movq [r0+r1*0+8], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
877 movq [r0+r1*1+0], mm2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
878 movq [r0+r1*1+8], mm3
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
879 lea r0, [r0+r1*2] ; dst += 2 rows
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
880 sub r4d, 2 ; two rows done; sets the flags tested by jg
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
881 jg .nextrow ; signed test: continue while rows remaining > 0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
882 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
883
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
; put_vp8_pixels16_sse: straight copy of 16-byte-wide rows, one XMM move per row and
; two rows per loop pass.
; Register roles as used by the code below:
;   r0 = destination pointer, r1 = destination row pitch
;   r2 = source pointer,      r3 = source row pitch
;   r4d = rows remaining
; Alignment: the source is read with movups (any alignment), but the destination is
; written with movaps, which faults unless the address is 16-byte aligned -- so both
; r0 and r1 must be multiples of 16.
; The loop is bottom-tested, so at least two rows are always copied.
; NOTE(review): the C prototype is not visible in this listing; cglobal and REP_RET
; are presumably the x86inc.asm prologue/return macros, and the third cglobal
; argument (2) presumably declares the number of XMM registers used -- confirm there.
884 cglobal put_vp8_pixels16_sse, 5,5,2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
885 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
886 movups xmm0, [r2+r3*0] ; source row 0 (unaligned load allowed)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
887 movups xmm1, [r2+r3*1] ; source row 1 (unaligned load allowed)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
888 lea r2, [r2+r3*2] ; src += 2 rows
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
889 movaps [r0+r1*0], xmm0 ; destination row 0 (must be 16-byte aligned)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
890 movaps [r0+r1*1], xmm1 ; destination row 1 (must be 16-byte aligned)
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
891 lea r0, [r0+r1*2] ; dst += 2 rows
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
892 sub r4d, 2 ; two rows done; sets the flags tested by jg
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
893 jg .nextrow ; signed test: continue while rows remaining > 0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
894 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
895
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
896 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
897 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
898 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
899
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
; ADD_DC add_reg, sub_reg, offset, mov_insn
;   %1 = register whose bytes are added to every pixel (paddusb, saturates at 255)
;   %2 = register whose bytes are subtracted from every pixel (psubusb, saturates at 0)
;   %3 = byte offset applied to all four row addresses
;   %4 = move instruction used for both the loads and the stores
; Works on four rows located at [r0], [r0+r2], [r1] and [r1+r2]; clobbers m2-m5.
; The callers in this file set r1 = r0 + 2*r2 and pass packuswb(dc) as %1 and
; packuswb(-dc) as %2. packuswb turns negative words into 0, so one of the two
; operands is always zero and add-then-subtract applies a signed DC with the result
; clamped to 0..255.
900 %macro ADD_DC 4
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
901 %4 m2, [r0+%3] ; load the four rows
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
902 %4 m3, [r0+r2+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
903 %4 m4, [r1+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
904 %4 m5, [r1+r2+%3]
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
905 paddusb m2, %1 ; pixel = min(pixel + %1, 255)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
906 paddusb m3, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
907 paddusb m4, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
908 paddusb m5, %1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
909 psubusb m2, %2 ; pixel = max(pixel - %2, 0)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
910 psubusb m3, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
911 psubusb m4, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
912 psubusb m5, %2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
913 %4 [r0+%3], m2 ; write the four rows back in place
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
914 %4 [r0+r2+%3], m3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
915 %4 [r1+%3], m4
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
916 %4 [r1+r2+%3], m5
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
917 %endmacro
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
918
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
; vp8_idct_dc_add_mmx: DC-only inverse transform for one 4x4 block. Computes
; dc = (block[0] + 4) >> 3, adds it with clamping to a 4x4 pixel area at dst, and
; zeroes the consumed coefficient storage.
; Register roles, following the prototype in the banner above (presumably mapped by
; cglobal in argument order): r0 = dst, r1 = block, r2 = stride. r1 is reused as
; dst + 2*stride once the coefficient has been read and cleared.
; NOTE(review): pw_4 is defined outside this view; by name and use it supplies the
; +4 rounding bias. movh is an x86inc alias -- presumably a 4-byte move under
; INIT_MMX, which is what makes the stores 4 pixels wide; confirm in x86inc.asm.
919 INIT_MMX
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
920 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
921 ; load data
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
922 movd m0, [r1] ; low word = DC coefficient (the next coefficient rides along in word 1)
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
923
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
924 ; calculate DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
925 paddw m0, [pw_4] ; rounding bias before the shift
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
926 pxor m1, m1 ; m1 = 0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
927 psraw m0, 3 ; dc = (coef + 4) >> 3 (arithmetic shift keeps the sign)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
928 movd [r1], m1 ; zero the first 4 bytes of the block
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
929 psubw m1, m0 ; m1 = -dc
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
930 packuswb m0, m0 ; bytes of max(dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
931 packuswb m1, m1 ; bytes of max(-dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
932 punpcklbw m0, m0 ; duplicate byte 0 into bytes 0-1 ...
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
933 punpcklbw m1, m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
934 punpcklwd m0, m0 ; ... then into bytes 0-3: one value per pixel of a 4-wide row
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
935 punpcklwd m1, m1
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
936
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
937 ; add DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
938 lea r1, [r0+r2*2] ; r1 = address of row 2, as ADD_DC expects
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
939 ADD_DC m0, m1, 0, movh ; rows 0-3: add positive part, subtract negative part
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
940 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
941
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
; vp8_idct_dc_add_sse4: DC-only inverse transform for one 4x4 block, XMM version.
; Unlike the MMX version it does not use ADD_DC: the four 4-pixel rows are gathered
; into registers, widened to words, the signed DC is added, and a single packuswb
; clamps all 16 results to 0..255. The rows are scattered back with movd/pextrd
; (pextrd with a memory destination is an SSE4.1 instruction, hence the suffix).
; Register roles, following the prototype in the banner above (presumably mapped by
; cglobal in argument order): r0 = dst, r1 = block, r2 = stride. r1 is reused as
; dst + 2*stride once the coefficient has been read and cleared.
; NOTE(review): pw_4 is defined outside this view; by name and use it supplies the
; +4 rounding bias. The last cglobal argument (6) presumably declares the number of
; XMM registers used (m0-m5 here) -- confirm in x86inc.asm.
942 INIT_XMM
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
943 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
944 ; load data
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
945 movd m0, [r1] ; low word = DC coefficient
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
946 pxor m1, m1 ; m1 = 0: used to clear the block and as the zero for byte->word unpacking
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
947
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
948 ; calculate DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
949 paddw m0, [pw_4] ; rounding bias before the shift
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
950 movd [r1], m1 ; zero the first 4 bytes of the block
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
951 lea r1, [r0+r2*2] ; r1 = address of row 2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
952 movd m2, [r0] ; row 0 (4 pixels)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
953 movd m3, [r0+r2] ; row 1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
954 movd m4, [r1] ; row 2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
955 movd m5, [r1+r2] ; row 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
956 psraw m0, 3 ; dc = (coef + 4) >> 3 (arithmetic shift keeps the sign)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
957 pshuflw m0, m0, 0 ; broadcast word 0 (dc) across the low four words
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
958 punpcklqdq m0, m0 ; ... and into the high four words: dc in all 8 words
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
959 punpckldq m2, m3 ; m2 low 8 bytes = row 0, row 1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
960 punpckldq m4, m5 ; m4 low 8 bytes = row 2, row 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
961 punpcklbw m2, m1 ; zero-extend rows 0-1 to words
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
962 punpcklbw m4, m1 ; zero-extend rows 2-3 to words
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
963 paddw m2, m0 ; pixel + dc (signed, still unclamped)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
964 paddw m4, m0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
965 packuswb m2, m4 ; clamp to 0..255; dwords 0-3 of m2 = rows 0-3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
966 movd [r0], m2 ; store row 0
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
967 pextrd [r0+r2], m2, 1 ; store row 1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
968 pextrd [r1], m2, 2 ; store row 2
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
969 pextrd [r1+r2], m2, 3 ; store row 3
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
970 RET
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
971
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
972 ;-----------------------------------------------------------------------------
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
973 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
974 ;-----------------------------------------------------------------------------
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
975
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
; vp8_idct_dc_add4y_mmx: DC-only inverse transform for four 4x4 blocks (A, B, C, D)
; laid out side by side, i.e. a 16-pixel-wide, 4-row area at dst. The four DC
; coefficients are read 32 bytes apart (one block of 16 16-bit coefficients each),
; rounded as (dc + 4) >> 3, and added with clamping via ADD_DC. The first 4 bytes of
; each block are zeroed.
; Register roles, following the prototype in the banner above (presumably mapped by
; cglobal in argument order): r0 = dst, r1 = block, r2 = stride. r1 is reused as
; dst + 2*stride once the coefficients have been read and cleared.
; Uses m0/m1 for the positive parts and m6/m7 for the negated parts; ADD_DC
; additionally clobbers m2-m5.
; NOTE(review): pw_4 is defined outside this view; by name and use it supplies the
; +4 rounding bias. mova is an x86inc alias -- presumably movq under INIT_MMX.
976 INIT_MMX
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
977 cglobal vp8_idct_dc_add4y_mmx, 3, 3
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
978 ; load data
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
979 movd m0, [r1+32*0] ; A (DC of block 0 in the low word)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
980 movd m1, [r1+32*2] ; C (DC of block 2)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
981 punpcklwd m0, [r1+32*1] ; A B (interleave with DC of block 1)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
982 punpcklwd m1, [r1+32*3] ; C D (interleave with DC of block 3)
12239
13b1ad24a4b1 VP8 asm: cosmetics (spacing)
darkshikari
parents: 12238
diff changeset
983 punpckldq m0, m1 ; A B C D (four DC words in one register)
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
984 pxor m6, m6 ; m6 = 0
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
985
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
986 ; calculate DC
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
987 paddw m0, [pw_4] ; rounding bias before the shift
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
988 movd [r1+32*0], m6 ; zero the first 4 bytes of each of the four blocks
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
989 movd [r1+32*1], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
990 movd [r1+32*2], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
991 movd [r1+32*3], m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
992 psraw m0, 3 ; dc = (coef + 4) >> 3 for all four blocks
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
993 psubw m6, m0 ; m6 = -dc
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
994 packuswb m0, m0 ; bytes of max(dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
995 packuswb m6, m6 ; bytes of max(-dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
996 punpcklbw m0, m0 ; AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
997 punpcklbw m6, m6 ; AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
998 movq m1, m0 ; keep a copy: the low half feeds A/B, the high half C/D
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
999 movq m7, m6
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1000 punpcklbw m0, m0 ; AAAABBBB
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1001 punpckhbw m1, m1 ; CCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1002 punpcklbw m6, m6 ; AAAABBBB
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1003 punpckhbw m7, m7 ; CCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1004
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1005 ; add DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1006 lea r1, [r0+r2*2] ; r1 = address of row 2, as ADD_DC expects
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1007 ADD_DC m0, m6, 0, mova ; pixels 0-7 of rows 0-3 (blocks A and B)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1008 ADD_DC m1, m7, 8, mova ; pixels 8-15 of rows 0-3 (blocks C and D)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1009 RET
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1010
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
; vp8_idct_dc_add4y_sse2: XMM version of the four-block luma DC add. All four DC
; values are expanded into one 16-byte register (AAAABBBBCCCCDDDD), so a single
; ADD_DC covers the whole 16-pixel-wide, 4-row area at dst.
; Register roles, following the prototype in the banner above (presumably mapped by
; cglobal in argument order): r0 = dst, r1 = block, r2 = stride. r1 is reused as
; dst + 2*stride once the coefficients have been read and cleared.
; Uses m0 (positive part) and m1 (negated part); ADD_DC additionally clobbers m2-m5.
; Alignment: punpcklwd with an XMM destination reads a full 16 bytes from its memory
; operand and requires that address to be 16-byte aligned, so block is assumed to be
; 16-byte aligned.
; NOTE(review): pw_4 is defined outside this view; by name and use it supplies the
; +4 rounding bias. mova is an x86inc alias -- presumably an aligned 16-byte move
; under INIT_XMM, which would also require dst rows to be 16-byte aligned; confirm.
1011 INIT_XMM
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1012 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1013 ; load data
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1014 movd m0, [r1+32*0] ; A (DC of block 0 in the low word)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1015 movd m1, [r1+32*2] ; C (DC of block 2)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1016 punpcklwd m0, [r1+32*1] ; A B (interleave with DC of block 1)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1017 punpcklwd m1, [r1+32*3] ; C D (interleave with DC of block 3)
12239
13b1ad24a4b1 VP8 asm: cosmetics (spacing)
darkshikari
parents: 12238
diff changeset
1018 punpckldq m0, m1 ; A B C D (in the low four words)
12238
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1019 pxor m1, m1 ; m1 = 0 (its C/D contents are no longer needed)
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1020
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1021 ; calculate DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1022 paddw m0, [pw_4] ; rounding bias before the shift
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1023 movd [r1+32*0], m1 ; zero the first 4 bytes of each of the four blocks
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1024 movd [r1+32*1], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1025 movd [r1+32*2], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1026 movd [r1+32*3], m1
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1027 psraw m0, 3 ; dc = (coef + 4) >> 3 for all four blocks
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1028 psubw m1, m0 ; m1 = -dc
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1029 packuswb m0, m0 ; bytes of max(dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1030 packuswb m1, m1 ; bytes of max(-dc, 0), saturated to 255
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1031 punpcklbw m0, m0 ; low 8 bytes: AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1032 punpcklbw m1, m1 ; low 8 bytes: AABBCCDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1033 punpcklbw m0, m0 ; AAAABBBBCCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1034 punpcklbw m1, m1 ; AAAABBBBCCCCDDDD
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1035
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1036 ; add DC
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1037 lea r1, [r0+r2*2] ; r1 = address of row 2, as ADD_DC expects
1a7903913e9b VP8: 30% faster idct_mb
darkshikari
parents: 12235
diff changeset
1038 ADD_DC m0, m1, 0, mova ; all 16 pixels of rows 0-3 in one pass
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1039 RET
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1040
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1041 ;-----------------------------------------------------------------------------
12241
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1042 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1043 ;-----------------------------------------------------------------------------
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1044
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
; vp8_idct_dc_add4uv_mmx: DC-only inverse transform for four 4x4 blocks arranged
; 2x2, i.e. an 8x8 pixel area at dst: A and B cover rows 0-3, C and D rows 4-7.
; The DC computation is the same as in the side-by-side luma variant; only the
; final addressing differs (second ADD_DC runs four rows further down instead of
; 8 bytes to the right).
; Register roles, following the prototype in the banner above (presumably mapped by
; cglobal in argument order): r0 = dst, r1 = block, r2 = stride. r1 is reused as a
; row pointer once the coefficients have been read and cleared.
; Uses m0/m1 for the positive parts and m6/m7 for the negated parts; ADD_DC
; additionally clobbers m2-m5.
; NOTE(review): pw_4 is defined outside this view; by name and use it supplies the
; +4 rounding bias. mova is an x86inc alias -- presumably movq under INIT_MMX.
1045 INIT_MMX
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1046 cglobal vp8_idct_dc_add4uv_mmx, 3, 3
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1047 ; load data
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1048 movd m0, [r1+32*0] ; A (DC of block 0 in the low word)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1049 movd m1, [r1+32*2] ; C (DC of block 2)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1050 punpcklwd m0, [r1+32*1] ; A B (interleave with DC of block 1)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1051 punpcklwd m1, [r1+32*3] ; C D (interleave with DC of block 3)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1052 punpckldq m0, m1 ; A B C D (four DC words in one register)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1053 pxor m6, m6 ; m6 = 0
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1054
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1055 ; calculate DC
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1056 paddw m0, [pw_4] ; rounding bias before the shift
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1057 movd [r1+32*0], m6 ; zero the first 4 bytes of each of the four blocks
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1058 movd [r1+32*1], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1059 movd [r1+32*2], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1060 movd [r1+32*3], m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1061 psraw m0, 3 ; dc = (coef + 4) >> 3 for all four blocks
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1062 psubw m6, m0 ; m6 = -dc
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1063 packuswb m0, m0 ; bytes of max(dc, 0), saturated to 255
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1064 packuswb m6, m6 ; bytes of max(-dc, 0), saturated to 255
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1065 punpcklbw m0, m0 ; AABBCCDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1066 punpcklbw m6, m6 ; AABBCCDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1067 movq m1, m0 ; keep a copy: the low half feeds A/B, the high half C/D
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1068 movq m7, m6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1069 punpcklbw m0, m0 ; AAAABBBB
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1070 punpckhbw m1, m1 ; CCCCDDDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1071 punpcklbw m6, m6 ; AAAABBBB
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1072 punpckhbw m7, m7 ; CCCCDDDD
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1073
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1074 ; add DC
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1075 lea r1, [r0+r2*2] ; r1 = address of row 2, as ADD_DC expects
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1076 ADD_DC m0, m6, 0, mova ; rows 0-3, 8 pixels wide (blocks A and B)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1077 lea r0, [r0+r2*4] ; r0 = address of row 4
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1078 lea r1, [r1+r2*4] ; r1 = address of row 6
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1079 ADD_DC m1, m7, 0, mova ; rows 4-7, 8 pixels wide (blocks C and D)
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1080 RET
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1081
c7f6ddcc5c01 VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents: 12239
diff changeset
1082 ;-----------------------------------------------------------------------------
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1083 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1084 ;-----------------------------------------------------------------------------
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1085
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1086 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1087 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
;   %1, %2 = input/output registers (packed signed words)
;   %3, %4 = scratch registers, clobbered
;   m6 must hold words of 20091 and m7 words of 17734 on entry
; Result: %1 = mul_35468(%1) - mul_20091(%2)
;         %2 = mul_35468(%2) + mul_20091(%1)     (using the original %1)
; where, as implemented here,
;   mul_20091(x) = x + ((x * 20091) >> 16)       i.e. x * (65536 + 20091) / 65536
;   mul_35468(x) = ((2 * x) * 17734) >> 16       i.e. x * 35468 / 65536
; pmulhw is a signed multiply and 35468 does not fit in a signed 16-bit word, which
; is why the input is doubled and multiplied by 17734 (= 35468 / 2) instead.
1088 %macro VP8_MULTIPLY_SUMSUB 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1089 mova %3, %1 ; work on copies so the originals stay available
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1090 mova %4, %2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1091 pmulhw %3, m6 ;20091(1): high 16 bits of %1 * 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1092 pmulhw %4, m6 ;20091(2): high 16 bits of %2 * 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1093 paddw %3, %1 ; %3 = mul_20091(%1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1094 paddw %4, %2 ; %4 = mul_20091(%2)
12018
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
1095 paddw %1, %1 ; double the input (x + x instead of a shift) ...
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
1096 paddw %2, %2
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1097 pmulhw %1, m7 ;35468(1): ... so that * 17734 gives mul_35468(%1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1098 pmulhw %2, m7 ;35468(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1099 psubw %1, %4 ; %1 = mul_35468(%1) - mul_20091(%2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1100 paddw %2, %3 ; %2 = mul_35468(%2) + mul_20091(%1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1101 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1102
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1103 ; calculate x0=%1+%3; x1=%1-%3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1104 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1105 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1106 ; %5/%6 are temporary registers
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1107 ; we assume m6/m7 have constant words 20091/17734 loaded in them
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
; VP8_IDCT_TRANSFORM4x4_1D in0, in1, in2, in3, tmp0, tmp1
;   %1-%4 = register NUMBERS of the four input/output vectors (the "m" prefix is
;           added inside the macro)
;   %5,%6 = register numbers of two scratch registers, clobbered
;   m6/m7 must hold the 20091/17734 constants needed by VP8_MULTIPLY_SUMSUB
; One 1-D pass of the 4-point inverse transform: even inputs (%1, %3) go through a
; plain sum/difference, odd inputs (%2, %4) through the rotation in
; VP8_MULTIPLY_SUMSUB, followed by the final butterflies.
; NOTE(review): SUMSUB_BA and SWAP are defined outside this view. SUMSUB_BA is
; presumably the x86util butterfly (sum in the first operand, difference in the
; second, third operand as scratch) and SWAP presumably renames registers at
; assembly time without emitting instructions -- confirm in x86util.asm/x86inc.asm.
1108 %macro VP8_IDCT_TRANSFORM4x4_1D 6
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1109 SUMSUB_BA m%3, m%1, m%5 ;t0, t1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1110 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1111 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1112 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1113 SWAP %4, %1 ; these two SWAPs put the results back in %1..%4 order (tmp0..tmp3)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1114 SWAP %4, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1115 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1116
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1117 INIT_MMX
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1118 %macro VP8_IDCT_ADD 1
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1119 cglobal vp8_idct_add_%1, 3, 3
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1120 ; load block data
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1121 movq m0, [r1+ 0]
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1122 movq m1, [r1+ 8]
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1123 movq m2, [r1+16]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1124 movq m3, [r1+24]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1125 movq m6, [pw_20091]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1126 movq m7, [pw_17734]
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1127 %ifidn %1, sse
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1128 xorps xmm0, xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1129 movaps [r1+ 0], xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1130 movaps [r1+16], xmm0
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1131 %else
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1132 pxor m4, m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1133 movq [r1+ 0], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1134 movq [r1+ 8], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1135 movq [r1+16], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1136 movq [r1+24], m4
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1137 %endif
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1138
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1139 ; actual IDCT
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1140 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1141 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1142 paddw m0, [pw_4]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1143 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1144 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1145
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1146 ; store
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1147 pxor m4, m4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1148 lea r1, [r0+2*r2]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1149 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1150 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1151
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1152 RET
12235
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1153 %endmacro
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1154
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1155 VP8_IDCT_ADD mmx
e08d65897115 VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents: 12227
diff changeset
1156 VP8_IDCT_ADD sse
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1157
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1158 ;-----------------------------------------------------------------------------
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1159 ; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16])
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1160 ;-----------------------------------------------------------------------------
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1161
12209
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1162 %macro SCATTER_WHT 3
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1163 movd r1d, m%1
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1164 movd r2d, m%2
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1165 mov [r0+2*16*(0+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1166 mov [r0+2*16*(1+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1167 shr r1d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1168 shr r2d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1169 psrlq m%1, 32
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1170 psrlq m%2, 32
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1171 mov [r0+2*16*(4+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1172 mov [r0+2*16*(5+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1173 movd r1d, m%1
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1174 movd r2d, m%2
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1175 mov [r0+2*16*(8+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1176 mov [r0+2*16*(9+%3)], r2w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1177 shr r1d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1178 shr r2d, 16
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1179 mov [r0+2*16*(12+%3)], r1w
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1180 mov [r0+2*16*(13+%3)], r2w
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1181 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1182
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1183 %macro HADAMARD4_1D 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1184 SUMSUB_BADC m%2, m%1, m%4, m%3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1185 SUMSUB_BADC m%4, m%2, m%3, m%1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1186 SWAP %1, %4, %3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1187 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1188
12340
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1189 %macro VP8_DC_WHT 1
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1190 cglobal vp8_luma_dc_wht_%1, 2,3
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1191 movq m0, [r1]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1192 movq m1, [r1+8]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1193 movq m2, [r1+16]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1194 movq m3, [r1+24]
12340
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1195 %ifidn %1, sse
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1196 xorps xmm0, xmm0
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1197 movaps [r1+ 0], xmm0
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1198 movaps [r1+16], xmm0
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1199 %else
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1200 pxor m4, m4
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1201 movq [r1+ 0], m4
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1202 movq [r1+ 8], m4
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1203 movq [r1+16], m4
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1204 movq [r1+24], m4
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1205 %endif
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1206 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1207 TRANSPOSE4x4W 0, 1, 2, 3, 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1208 paddw m0, [pw_3]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1209 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1210 psraw m0, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1211 psraw m1, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1212 psraw m2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1213 psraw m3, 3
12209
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1214 SCATTER_WHT 0, 1, 0
9eef00a43280 Make mmx VP8 WHT faster
darkshikari
parents: 12205
diff changeset
1215 SCATTER_WHT 2, 3, 2
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1216 RET
12340
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1217 %endmacro
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1218
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1219 INIT_MMX
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1220 VP8_DC_WHT mmx
2d15f62f4f8a VP8: move zeroing of luma DC block into the WHT
darkshikari
parents: 12334
diff changeset
1221 VP8_DC_WHT sse
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1222
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1223 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1224 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1225 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1226
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1227 ; macro called with 7 mm register indexes as arguments, and 4 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1228 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1229 ; first 4 mm registers will carry the transposed pixel data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1230 ; the other three are scratchspace (one would be sufficient, but this allows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1231 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1232 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1233 ; first two regular registers are buf+4*stride and buf+5*stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1234 ; third is -stride, fourth is +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1235 %macro READ_8x4_INTERLEAVED 11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1236 ; interleave 8 (A-H) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1237 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1238 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1239 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1240 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1241 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1242 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1243 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1244 punpcklbw m%1, m%5 ; A/B interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1245 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1246 punpcklbw m%2, m%6 ; C/D interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1247 punpcklbw m%3, m%7 ; E/F interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1248 punpcklbw m%4, m%5 ; G/H interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1249 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1250
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1251 ; macro called with 7 mm register indexes as arguments, and 5 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1252 ; the first 11 arguments mean the same as for READ_8x4_INTERLEAVED above
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1253 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1254 ; will be set to second regular register + 8*stride at the end
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1255 %macro READ_16x4_INTERLEAVED 12
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1256 ; interleave 16 (A-P) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1257 lea %12, [r0+8*r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1258
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1259 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1260 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1261 movd m%3, [%12+%10*4] ; I0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1262 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1263 movd m%4, [%12+%10*2] ; K0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1264 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1265 movd m%5, [%12+%10] ; L0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1266 movd m%7, [%12] ; M0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1267 add %12, %11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1268 punpcklbw m%1, m%3 ; A/I
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1269 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1270 punpcklbw m%2, m%4 ; C/K
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1271 punpcklbw m%6, m%5 ; D/L
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1272 punpcklbw m%3, m%7 ; E/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1273 punpcklbw m%2, m%6 ; C/D/K/L interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1274
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1275 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1276 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1277 movd m%4, [%12+%10*4] ; J0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1278 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1279 movd m%6, [%12] ; N0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1280 punpcklbw m%5, m%4 ; B/J
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1281 punpcklbw m%7, m%6 ; F/N
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1282 punpcklbw m%1, m%5 ; A/B/I/J interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1283 punpcklbw m%3, m%7 ; E/F/M/N interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1284 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1285 movd m%6, [%12+%11] ; O0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1286 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1287 movd m%7, [%12+%11*2] ; P0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1288 punpcklbw m%4, m%6 ; G/O
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1289 punpcklbw m%5, m%7 ; H/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1290 punpcklbw m%4, m%5 ; G/H/O/P interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1291 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1292
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1293 ; write 4 mm registers of 2 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1294 ; first four arguments are mm register indexes containing source data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1295 ; last four are registers containing buf+4*stride, buf+5*stride,
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1296 ; -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1297 %macro WRITE_4x2D 8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1298 ; write out (2 dwords per register)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1299 movd [%5+%7*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1300 movd [%5+%7*2], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1301 movd [%5], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1302 movd [%6+%8], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1303 punpckhdq m%1, m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1304 punpckhdq m%2, m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1305 punpckhdq m%3, m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1306 punpckhdq m%4, m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1307 movd [%6+%7*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1308 movd [%5+%7], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1309 movd [%6], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1310 movd [%6+%8*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1311 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1312
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1313 ; write 4 xmm registers of 4 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1314 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1315 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1316 ; we add 1*stride to the third regular register in the process
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1317 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1318 ; same memory region), or 8 if they cover two separate buffers (third one points to
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1319 ; a different memory region than the first two), allowing for more optimal code for
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1320 ; the 16-width case
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1321 %macro WRITE_4x4D 10
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1322 ; write out (4 dwords per register), start with dwords zero
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1323 movd [%5+%8*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1324 movd [%5], m%2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1325 movd [%7+%8*4], m%3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1326 movd [%7], m%4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1327
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1328 ; store dwords 1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1329 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1330 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1331 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1332 psrldq m%4, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1333 movd [%6+%8*4], m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1334 movd [%6], m%2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1335 %if %10 == 16
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1336 movd [%6+%9*4], m%3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1337 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1338 movd [%7+%9], m%4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1339
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1340 ; write dwords 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1341 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1342 psrldq m%2, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1343 %if %10 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1344 movd [%5+%8*2], m%1
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1345 movd %5d, m%3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1346 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1347 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1348 psrldq m%4, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1349 %if %10 == 16
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1350 movd [%5+%8*2], m%1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1351 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1352 movd [%6+%9], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1353 movd [%7+%8*2], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1354 movd [%7+%9*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1355 add %7, %9
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1356
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1357 ; store dwords 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1358 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1359 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1360 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1361 psrldq m%4, 4
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1362 %if %10 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1363 mov [%7+%8*4], %5d
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1364 movd [%6+%8*2], m%1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1365 %else
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1366 movd [%5+%8], m%1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1367 %endif
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1368 movd [%6+%9*2], m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1369 movd [%7+%8*2], m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1370 movd [%7+%9*2], m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1371 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1372
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines (one word per line)
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1374 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1375 ; for pre-SSE4:
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1376 ; 3 is a general-purpose register that we will clobber
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1377 ; for SSE4:
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1378 ; 3 is a pointer to the destination's 5th line
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1379 ; 4 is a pointer to the destination's 4th line
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1380 ; 5/6 is -stride and +stride
; NOTE: the argument numbers above are those of the 6-argument WRITE_2x4W.
; WRITE_8W_SSE2 and WRITE_8W_SSE4 take a single source register, so every
; later argument sits one position lower there (%2 = scratch register or
; second pointer, %3 = pointer, %4/%5 = the two strides).
; Lines are counted from 0: the "4th line" is the one that receives word 4.
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1381 %macro WRITE_2x4W 6
; Scatter the words of %1 (words 0-3) and %2 (words 4-7) to eight lines with
; one 16-bit store each. Below, "dst" is the value of %4 on entry and the
; line offsets assume %5 = -stride and %6 = +stride, as the header states.
; Clobbers %3; %1 and %2 are overwritten by punpckhdq; %4 is stepped while
; storing and, under the stride assumption, is back at dst on exit.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1382 movd %3d, %1 ; %3 = words 0-1 of %1
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1383 punpckhdq %1, %1 ; low dword of %1 = its old high dword (words 2-3 for an 8-byte mm register)
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1384 mov [%4+%5*4], %3w ; word 0 -> dst-4*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1385 shr %3, 16 ; %3w = word 1
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1386 add %4, %6 ; %4 = dst+stride, so the *4 scale can be reused for the next line
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1387 mov [%4+%5*4], %3w ; word 1 -> dst-3*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1388
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1389 movd %3d, %1 ; %3 = words 2-3
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1390 add %4, %5 ; %4 = dst again
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1391 mov [%4+%5*2], %3w ; word 2 -> dst-2*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1392 shr %3, 16 ; %3w = word 3
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1393 mov [%4+%5 ], %3w ; word 3 -> dst-stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1394
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1395 movd %3d, %2 ; %3 = words 0-1 of %2 (words 4-5 overall)
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1396 punpckhdq %2, %2 ; same trick as for %1: bring the high dword down
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1397 mov [%4 ], %3w ; word 4 -> dst
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1398 shr %3, 16 ; %3w = word 5
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1399 mov [%4+%6 ], %3w ; word 5 -> dst+stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1400
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1401 movd %3d, %2 ; %3 = words 6-7 overall
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1402 add %4, %6 ; %4 = dst+stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1403 mov [%4+%6 ], %3w ; word 6 -> dst+2*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1404 shr %3, 16 ; %3w = word 7
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1405 mov [%4+%6*2], %3w ; word 7 -> dst+3*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1406 add %4, %5 ; undo the last step: %4 = dst on exit
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1407 %endmacro
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1408
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1409 %macro WRITE_8W_SSE2 5
; Scatter the eight words of xmm register %1 to eight lines with one 16-bit
; store each, pulling two words at a time through general-purpose register %2.
; %1 = source register (shifted right by 4 bytes three times, so destroyed)
; %2 = general-purpose scratch register (clobbered)
; %3 = destination pointer; "dst" below is its value on entry
; %4, %5 = strides; the line offsets below assume %4 = -stride, %5 = +stride
; NOTE(review): %3 is advanced by %5, %4, %5 and never stepped back, so under
; that assumption it is left at dst+stride on exit (WRITE_2x4W restores its
; pointer) -- callers presumably account for this; confirm at the call sites.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1410 movd %2d, %1 ; %2 = words 0-1
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1411 psrldq %1, 4 ; next two words move into the low dword of %1
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1412 mov [%3+%4*4], %2w ; word 0 -> dst-4*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1413 shr %2, 16 ; %2w = word 1
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1414 add %3, %5 ; %3 = dst+stride, so the *4 scale can be reused for the next line
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1415 mov [%3+%4*4], %2w ; word 1 -> dst-3*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1416
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1417 movd %2d, %1 ; %2 = words 2-3
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1418 psrldq %1, 4 ; words 4-5 move into the low dword of %1
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1419 add %3, %4 ; %3 = dst again
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1420 mov [%3+%4*2], %2w ; word 2 -> dst-2*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1421 shr %2, 16 ; %2w = word 3
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1422 mov [%3+%4 ], %2w ; word 3 -> dst-stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1423
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1424 movd %2d, %1 ; %2 = words 4-5
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1425 psrldq %1, 4 ; words 6-7 move into the low dword of %1
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1426 mov [%3 ], %2w ; word 4 -> dst
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1427 shr %2, 16 ; %2w = word 5
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1428 mov [%3+%5 ], %2w ; word 5 -> dst+stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1429
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1430 movd %2d, %1 ; %2 = words 6-7
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1431 add %3, %5 ; %3 = dst+stride (not undone before the macro ends)
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1432 mov [%3+%5 ], %2w ; word 6 -> dst+2*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1433 shr %2, 16 ; %2w = word 7
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1434 mov [%3+%5*2], %2w ; word 7 -> dst+3*stride
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1435 %endmacro
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1436
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1437 %macro WRITE_8W_SSE4 5
; Store the eight words of xmm register %1 to eight lines using pextrw with a
; memory destination (an SSE4.1 encoding), so no scratch register is needed
; and no register is modified.
; %2, %3 = two base pointers; %4, %5 = the two strides.
; Words 0, 2, 3, 4 are addressed from %3 and words 1, 5, 6, 7 from %2.
; NOTE(review): the eight stores land on consecutive lines only if
; %4 = -stride, %5 = +stride and %2 = %3 + stride -- confirm at the call sites.
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1438 pextrw [%3+%4*4], %1, 0 ; word 0 -> %3 + 4*%4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1439 pextrw [%2+%4*4], %1, 1 ; word 1 -> %2 + 4*%4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1440 pextrw [%3+%4*2], %1, 2 ; word 2 -> %3 + 2*%4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1441 pextrw [%3+%4 ], %1, 3 ; word 3 -> %3 + %4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1442 pextrw [%3 ], %1, 4 ; word 4 -> %3
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1443 pextrw [%2 ], %1, 5 ; word 5 -> %2
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1444 pextrw [%2+%5 ], %1, 6 ; word 6 -> %2 + %5
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1445 pextrw [%2+%5*2], %1, 7 ; word 7 -> %2 + 2*%5
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1446 %endmacro
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1447
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1448 %macro SPLATB_REG_MMX 2-3
; Broadcast the low byte of general-purpose register %2 into all 8 bytes of
; the mm register %1, using only unpack instructions.
; An optional third argument is accepted but not used by this variant.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1449 movd %1, %2d ; low dword of %1 = %2d; byte 0 is the value to splat
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1450 punpcklbw %1, %1 ; double each byte: word 0 = byte 0 twice
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1451 punpcklwd %1, %1 ; double each word: dword 0 = byte 0 four times
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1452 punpckldq %1, %1 ; double dword 0: all 8 bytes = byte 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1453 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1454
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1455 %macro SPLATB_REG_MMXEXT 2-3
; Broadcast the low byte of general-purpose register %2 into all 8 bytes of
; the mm register %1; pshufw replaces the last two unpacks of the MMX variant.
; An optional third argument is accepted but not used by this variant.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1456 movd %1, %2d ; low dword of %1 = %2d; byte 0 is the value to splat
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1457 punpcklbw %1, %1 ; word 0 = byte 0 twice
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1458 pshufw %1, %1, 0x0 ; copy word 0 into all four words
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1459 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1460
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1461 %macro SPLATB_REG_SSE2 2-3
; Broadcast the low byte of general-purpose register %2 into all 16 bytes of
; the xmm register %1.
; An optional third argument is accepted but not used by this variant.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1462 movd %1, %2d ; low dword of %1 = %2d; byte 0 is the value to splat
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1463 punpcklbw %1, %1 ; word 0 = byte 0 twice
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1464 pshuflw %1, %1, 0x0 ; copy word 0 into the four low words (low 8 bytes)
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1465 punpcklqdq %1, %1 ; copy the low 8 bytes into the high 8 bytes
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1466 %endmacro
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1467
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1468 %macro SPLATB_REG_SSSE3 3
; Splat a byte from general-purpose register %2 across %1 with one pshufb.
; %3 is the shuffle-control register and is required by this variant.
; NOTE(review): the result is a broadcast of byte 0 only when %3 is all
; zeroes; the SIMPLE_LOOPFILTER call site in this file zeroes m0 with pxor
; before passing it here -- confirm that other callers do the same.
12457
2982071047a2 Use "d" suffix for general-purpose registers used with movd.
reimar
parents: 12413
diff changeset
1469 movd %1, %2d ; low dword of %1 = %2d
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1470 pshufb %1, %3 ; each byte of %1 = the source byte selected by %3
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1471 %endmacro
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1472
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1473 %macro SIMPLE_LOOPFILTER 4
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1474 cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1475 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1476 mov r3, 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1477 %endif
12274
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1478 %ifnidn %1, sse2
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1479 %if mmsize == 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1480 pxor m0, m0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1481 %endif
12274
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1482 %endif
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1484
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1485 ; set up indexes to address 4 rows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1486 mov r2, r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1487 neg r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1488 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1489 lea r0, [r0+4*r2-2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1490 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1491
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1492 %if mmsize == 8 ; mmx / mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1493 .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1494 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1495 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1496 ; read 4 half/full rows of pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1497 mova m0, [r0+r1*2] ; p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1498 mova m1, [r0+r1] ; p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1499 mova m2, [r0] ; q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1500 mova m3, [r0+r2] ; q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1501 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1502 lea r4, [r0+r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1503
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1504 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1505 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1506 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1507 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1508 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1509 TRANSPOSE4x4W 0, 1, 2, 3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1510 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1511
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1512 ; simple_limit
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1513 mova m5, m2 ; m5=backup of q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1514 mova m6, m1 ; m6=backup of p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1515 psubusb m1, m2 ; p0-q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1516 psubusb m2, m6 ; q0-p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1517 por m1, m2 ; FFABS(p0-q0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1518 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1519
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1520 mova m4, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1521 mova m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1522 psubusb m3, m0 ; q1-p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1523 psubusb m0, m4 ; p1-q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1524 por m3, m0 ; FFABS(p1-q1)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1525 mova m0, [pb_80]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1526 pxor m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1527 pxor m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1528 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1529 pand m3, [pb_FE]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1530 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1531 paddusb m3, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1532 psubusb m3, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1533 pxor m1, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1534 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1535
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1536 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1537 mova m4, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1538 pxor m5, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1539 pxor m0, m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1540 psubsb m5, m0 ; q0-p0 (signed)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1541 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1542 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1543 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1544 pand m2, m3 ; apply filter mask (m3)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1545
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1546 mova m3, [pb_F8]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1547 mova m1, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1548 paddsb m2, [pb_4] ; f1<<3=a+4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1549 paddsb m1, [pb_3] ; f2<<3=a+3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1550 pand m2, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1551 pand m1, m3 ; cache f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1552
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1553 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1554 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1555 pcmpgtb m0, m2 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1556 psubb m3, m2 ; -f1<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1557 psrlq m2, 3 ; +f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1558 psrlq m3, 3 ; -f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1559 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1560 pandn m0, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1561 psubusb m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1562 paddusb m4, m3 ; q0-f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1563
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1564 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1565 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1566 pcmpgtb m0, m1 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1567 psubb m3, m1 ; -f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1568 psrlq m1, 3 ; +f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1569 psrlq m3, 3 ; -f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1570 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1571 pandn m0, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1572 paddusb m6, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1573 psubusb m6, m3 ; p0+f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1574
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1575 ; store
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1576 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1577 mova [r0], m4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1578 mova [r0+r1], m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1579 %else ; h
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1580 inc r0
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1581 SBUTTERFLY bw, 6, 4, 0
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1582
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1583 %if mmsize == 16 ; sse2
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1584 %ifidn %1, sse4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1585 inc r4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1586 %endif
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1587 WRITE_8W m6, r4, r0, r1, r2
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1588 lea r4, [r3+r1+1]
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1589 %ifidn %1, sse4
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1590 inc r3
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1591 %endif
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1592 WRITE_8W m4, r3, r4, r1, r2
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1593 %else ; mmx/mmxext
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1594 WRITE_2x4W m6, m4, r4, r0, r1, r2
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1595 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1596 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1597
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1598 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1599 ; next 8 pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1600 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1601 add r0, 8 ; advance 8 cols = pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1602 %else ; h
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1603 lea r0, [r0+r2*8-1] ; advance 8 rows = lines
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1604 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1605 dec r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1606 jg .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1607 REP_RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1608 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1609 RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1610 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1611 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1612
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
; -----------------------------------------------------------------------------
; Instantiations of the SIMPLE_LOOPFILTER macro, one per instruction set and
; filter direction.
;
; Arguments as they are used at the call sites below:
;   1st: instruction-set suffix (mmx, mmxext, sse2, ssse3, sse4); the macro
;        body compares it against "sse4" in its h store path.
;   2nd: direction, v or h; the macro body branches on it for store/advance.
;   3rd/4th: consumed by the part of the macro that is not shown next to these
;        lines -- presumably the general-purpose register count and the number
;        of xmm registers to mark as clobbered (0 for the MMX builds, 8 for
;        the XMM builds). TODO confirm against the macro's cglobal line.
;
; INIT_MMX / INIT_XMM are defined elsewhere (presumably x86inc); the macro body
; tells the two register widths apart with "mmsize == 8" vs "mmsize == 16".
;
; SPLATB_REG and WRITE_8W are re-%define'd immediately before the expansions
; that should pick them up, so the order of the lines below matters: each
; SIMPLE_LOOPFILTER expansion sees whichever definition was made last.
; WRITE_8W is only defined for the XMM builds; in the macro's store code it is
; referenced only in the h branch under "mmsize == 16", which is presumably why
; sse4 (which differs only by WRITE_8W_SSE4) gets an h variant and no v variant.
; -----------------------------------------------------------------------------
1613 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1614 %define SPLATB_REG SPLATB_REG_MMX
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1615 SIMPLE_LOOPFILTER mmx, v, 4, 0
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1616 SIMPLE_LOOPFILTER mmx, h, 5, 0
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1617 %define SPLATB_REG SPLATB_REG_MMXEXT
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1618 SIMPLE_LOOPFILTER mmxext, v, 4, 0
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1619 SIMPLE_LOOPFILTER mmxext, h, 5, 0
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1620 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1621 %define SPLATB_REG SPLATB_REG_SSE2
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1622 %define WRITE_8W WRITE_8W_SSE2
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1623 SIMPLE_LOOPFILTER sse2, v, 3, 8
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1624 SIMPLE_LOOPFILTER sse2, h, 5, 8
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1625 %define SPLATB_REG SPLATB_REG_SSSE3
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1626 SIMPLE_LOOPFILTER ssse3, v, 3, 8
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1627 SIMPLE_LOOPFILTER ssse3, h, 5, 8
12334
435319d67bd8 Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents: 12279
diff changeset
1628 %define WRITE_8W WRITE_8W_SSE4
12413
e6e4059ea421 Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents: 12400
diff changeset
1629 SIMPLE_LOOPFILTER sse4, h, 5, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1630
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1631 ;-----------------------------------------------------------------------------
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1633 ; int flimE, int flimI, int hev_thr);
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1634 ;-----------------------------------------------------------------------------
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1635
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1636 %macro INNER_LOOPFILTER 5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1637 %if %4 == 8 ; chroma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1639 %define dst8_reg r1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1640 %define mstride_reg r2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1641 %define E_reg r3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1642 %define I_reg r4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1643 %define hev_thr_reg r5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1644 %else ; luma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1645 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1646 %define mstride_reg r1
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1647 %define E_reg r2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1648 %define I_reg r3
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1649 %define hev_thr_reg r4
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1650 %ifdef m8 ; x86-64, sse2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1651 %define dst8_reg r4
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1652 %elif mmsize == 16 ; x86-32, sse2
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1653 %define dst8_reg r5
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1654 %else ; x86-32, mmx/mmxext
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1655 %define cnt_reg r5
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1656 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1657 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1658 %define dst_reg r0
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1659 %define stride_reg E_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1660 %define dst2_reg I_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1661 %ifndef m8
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1662 %define stack_reg hev_thr_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1663 %endif
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1664
12274
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1665 %ifnidn %1, sse2
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1666 %if mmsize == 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1667 pxor m7, m7
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1668 %endif
12274
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
1669 %endif
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
1670
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1671 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1672 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1673 SPLATB_REG m0, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1674 SPLATB_REG m1, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1675 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1676
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1677 ; align stack
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1678 mov stack_reg, rsp ; backup stack pointer
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1679 and rsp, ~(mmsize-1) ; align stack
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1680 %ifidn %2, v
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1681 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1682 ; [3]=hev() result
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1683 %else ; h
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1684 sub rsp, mmsize * 5 ; extra storage space for transposes
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1685 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1686
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1687 %define flim_E [rsp]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1688 %define flim_I [rsp+mmsize]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1689 %define hev_thr [rsp+mmsize*2]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1690 %define mask_res [rsp+mmsize*3]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1691 %define p0backup [rsp+mmsize*3]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1692 %define q0backup [rsp+mmsize*4]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1693
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1694 mova flim_E, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1695 mova flim_I, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1696 mova hev_thr, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1697
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1698 %else ; sse2 on x86-64
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1699
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1700 %define flim_E m9
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1701 %define flim_I m10
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1702 %define hev_thr m11
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1703 %define mask_res m12
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1704 %define p0backup m12
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1705 %define q0backup m8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1706
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1707 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1708 SPLATB_REG flim_E, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1709 SPLATB_REG flim_I, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
1710 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1711 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1712
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1713 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1714 mov cnt_reg, 2
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1715 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1716 mov stride_reg, mstride_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1717 neg mstride_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1718 %ifidn %2, h
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1719 lea dst_reg, [dst_reg + stride_reg*4-4]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1720 %if %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1721 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1722 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1723 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1724
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1725 %if mmsize == 8
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1726 .next8px
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1727 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1728 ; read
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1729 lea dst2_reg, [dst_reg + stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1730 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1731 %if %4 == 8 && mmsize == 16
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1732 %define movrow movh
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1733 %else
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1734 %define movrow mova
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1735 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1736 movrow m0, [dst_reg +mstride_reg*4] ; p3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1737 movrow m1, [dst2_reg+mstride_reg*4] ; p2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1738 movrow m2, [dst_reg +mstride_reg*2] ; p1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1739 movrow m5, [dst2_reg] ; q1
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1740 movrow m6, [dst2_reg+ stride_reg] ; q2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1741 movrow m7, [dst2_reg+ stride_reg*2] ; q3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1742 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1743 movhps m0, [dst8_reg+mstride_reg*4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1744 movhps m2, [dst8_reg+mstride_reg*2]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1745 add dst8_reg, stride_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1746 movhps m1, [dst8_reg+mstride_reg*4]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1747 movhps m5, [dst8_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1748 movhps m6, [dst8_reg+ stride_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1749 movhps m7, [dst8_reg+ stride_reg*2]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1750 add dst8_reg, mstride_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1751 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1752 %elif mmsize == 8 ; mmx/mmxext (h)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1753 ; read 8 rows of 8px each
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1754 movu m0, [dst_reg +mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1755 movu m1, [dst2_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1756 movu m2, [dst_reg +mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1757 movu m3, [dst_reg +mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1758 movu m4, [dst_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1759 movu m5, [dst2_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1760 movu m6, [dst2_reg+ stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1761
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1762 ; 8x8 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1763 TRANSPOSE4x4B 0, 1, 2, 3, 7
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1764 mova q0backup, m1
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1765 movu m7, [dst2_reg+ stride_reg*2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1766 TRANSPOSE4x4B 4, 5, 6, 7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1767 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1768 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1769 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1770 mova m1, q0backup
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1771 mova q0backup, m2 ; store q0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1772 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1773 mova p0backup, m5 ; store p0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1774 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1775 SWAP 2, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1776 SWAP 6, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1777 SWAP 5, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1778 %else ; sse2 (h)
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1779 %if %4 == 16
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1780 lea dst8_reg, [dst_reg + stride_reg*8]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1781 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1782
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1783 ; read 16 rows of 8px each, interleave
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1784 movh m0, [dst_reg +mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1785 movh m1, [dst8_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1786 movh m2, [dst_reg +mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1787 movh m5, [dst8_reg+mstride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1788 movh m3, [dst_reg +mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1789 movh m6, [dst8_reg+mstride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1790 movh m4, [dst_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1791 movh m7, [dst8_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1792 punpcklbw m0, m1 ; A/I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1793 punpcklbw m2, m5 ; C/K
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1794 punpcklbw m3, m6 ; D/L
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1795 punpcklbw m4, m7 ; E/M
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1796
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1797 add dst8_reg, stride_reg
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1798 movh m1, [dst2_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1799 movh m6, [dst8_reg+mstride_reg*4]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1800 movh m5, [dst2_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1801 movh m7, [dst8_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1802 punpcklbw m1, m6 ; B/J
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1803 punpcklbw m5, m7 ; F/N
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1804 movh m6, [dst2_reg+ stride_reg]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1805 movh m7, [dst8_reg+ stride_reg]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1806 punpcklbw m6, m7 ; G/O
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1807
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1808 ; 8x16 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1809 TRANSPOSE4x4B 0, 1, 2, 3, 7
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1810 %ifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1811 SWAP 1, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1812 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1813 mova q0backup, m1
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1814 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1815 movh m7, [dst2_reg+ stride_reg*2]
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
1816 movh m1, [dst8_reg+ stride_reg*2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1817 punpcklbw m7, m1 ; H/P
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1818 TRANSPOSE4x4B 4, 5, 6, 7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1819 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1820 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1821 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1822 %ifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1823 SWAP 1, 8
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1824 SWAP 2, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1825 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1826 mova m1, q0backup
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1827 mova q0backup, m2 ; store q0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1828 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1829 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1830 %ifdef m12
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1831 SWAP 5, 12
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1832 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1833 mova p0backup, m5 ; store p0
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1834 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1835 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1836 SWAP 2, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1837 SWAP 6, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1838 SWAP 5, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1839 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1840
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1841 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1842 mova m4, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1843 SWAP 4, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1844 psubusb m4, m0 ; p2-p3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1845 psubusb m0, m1 ; p3-p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1846 por m0, m4 ; abs(p3-p2)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1847
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1848 mova m4, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1849 SWAP 4, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1850 psubusb m4, m1 ; p1-p2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1851 psubusb m1, m2 ; p2-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1852 por m1, m4 ; abs(p2-p1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1853
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1854 mova m4, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1855 SWAP 4, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1856 psubusb m4, m7 ; q2-q3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1857 psubusb m7, m6 ; q3-q2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1858 por m7, m4 ; abs(q3-q2)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1859
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1860 mova m4, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1861 SWAP 4, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1862 psubusb m4, m6 ; q1-q2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1863 psubusb m6, m5 ; q2-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1864 por m6, m4 ; abs(q2-q1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1865
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1866 %ifidn %1, mmx
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1867 mova m4, flim_I
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1868 pxor m3, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1869 psubusb m0, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1870 psubusb m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1871 psubusb m7, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1872 psubusb m6, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1873 pcmpeqb m0, m3 ; abs(p3-p2) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1874 pcmpeqb m1, m3 ; abs(p2-p1) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1875 pcmpeqb m7, m3 ; abs(q3-q2) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1876 pcmpeqb m6, m3 ; abs(q2-q1) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1877 pand m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1878 pand m7, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1879 pand m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1880 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1881 pmaxub m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1882 pmaxub m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1883 pmaxub m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1884 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1885
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1886 ; normal_limit and high_edge_variance for p1-p0, q1-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1887 SWAP 7, 3 ; now m7 is zero
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1888 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1889 movrow m3, [dst_reg +mstride_reg] ; p0
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1890 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1891 movhps m3, [dst8_reg+mstride_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1892 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1893 %elifdef m12
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1894 SWAP 3, 12
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1895 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1896 mova m3, p0backup
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1897 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1898
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1899 mova m1, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1900 SWAP 1, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1901 mova m6, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1902 SWAP 3, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1903 psubusb m1, m3 ; p1-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1904 psubusb m6, m2 ; p0-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1905 por m1, m6 ; abs(p1-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1906 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1907 mova m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1908 psubusb m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1909 psubusb m6, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1910 pcmpeqb m1, m7 ; abs(p1-p0) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1911 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1912 pand m0, m1
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1913 mova mask_res, m6
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1914 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1915 pmaxub m0, m1 ; max_I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1916 SWAP 1, 4 ; max_hev_thresh
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1917 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1918
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1919 SWAP 6, 4 ; now m6 is I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1920 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1921 movrow m4, [dst_reg] ; q0
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1922 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1923 movhps m4, [dst8_reg]
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1924 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1925 %elifdef m8
12195
e7847fcff0f4 Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents: 12194
diff changeset
1926 SWAP 4, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1927 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1928 mova m4, q0backup
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1929 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1930 mova m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1931 SWAP 1, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1932 mova m7, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1933 SWAP 7, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1934 psubusb m1, m5 ; q0-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1935 psubusb m7, m4 ; q1-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1936 por m1, m7 ; abs(q1-q0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1937 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1938 mova m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1939 psubusb m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1940 psubusb m7, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1941 pxor m6, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1942 pcmpeqb m1, m6 ; abs(q1-q0) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1943 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1944 mova m6, mask_res
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1945 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1946 pand m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1947 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1948 pxor m7, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1949 pmaxub m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1950 pmaxub m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1951 psubusb m0, flim_I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1952 psubusb m6, hev_thr
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1953 pcmpeqb m0, m7 ; max(abs(..)) <= I
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1954 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1955 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1956 %ifdef m12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1957 SWAP 6, 12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1958 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
1959 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1960 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1961
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1962 ; simple_limit
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1963 mova m1, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1964 SWAP 1, 3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1965 mova m6, m4 ; keep copies of p0/q0 around for later use
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1966 SWAP 6, 4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1967 psubusb m1, m4 ; p0-q0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1968 psubusb m6, m3 ; q0-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1969 por m1, m6 ; abs(q0-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1970 paddusb m1, m1 ; m1=2*abs(q0-p0)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1971
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1972 mova m7, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1973 SWAP 7, 2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1974 mova m6, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1975 SWAP 6, 5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1976 psubusb m7, m5 ; p1-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1977 psubusb m6, m2 ; q1-p1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1978 por m7, m6 ; abs(q1-p1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1979 pxor m6, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1980 pand m7, [pb_FE]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1981 psrlq m7, 1 ; abs(q1-p1)/2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1982 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1983 psubusb m7, flim_E
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1984 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1985 pand m0, m7 ; normal_limit result
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1986
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1987 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1988 %ifdef m8 ; x86-64 && sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1989 mova m8, [pb_80]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1990 %define pb_80_var m8
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1991 %else ; x86-32 or mmx/mmxext
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1992 %define pb_80_var [pb_80]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1993 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1994 mova m1, m4
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1995 mova m7, m3
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1996 pxor m1, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1997 pxor m7, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1998 psubsb m1, m7 ; (signed) q0-p0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
1999 mova m6, m2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2000 mova m7, m5
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2001 pxor m6, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2002 pxor m7, pb_80_var
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2003 psubsb m6, m7 ; (signed) p1-q1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2004 mova m7, mask_res
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2005 pandn m7, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2006 paddsb m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2007 paddsb m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2008 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2009
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2010 pand m7, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2011 mova m1, [pb_F8]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2012 mova m6, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2013 paddsb m7, [pb_3]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2014 paddsb m6, [pb_4]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2015 pand m7, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2016 pand m6, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2017
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2018 pxor m1, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2019 pxor m0, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2020 pcmpgtb m1, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2021 psubb m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2022 psrlq m7, 3 ; +f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2023 psrlq m0, 3 ; -f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2024 pand m0, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2025 pandn m1, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2026 psubusb m3, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2027 paddusb m3, m1 ; p0+f2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2028
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2029 pxor m1, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2030 pxor m0, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2031 pcmpgtb m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2032 psubb m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2033 psrlq m6, 3 ; +f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2034 psrlq m1, 3 ; -f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2035 pand m1, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2036 pandn m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2037 psubusb m4, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2038 paddusb m4, m1 ; q0-f1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2039
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2040 %ifdef m12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2041 SWAP 6, 12
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2042 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2043 mova m6, mask_res
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2044 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2045 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2046 mova m7, [pb_1]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2047 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2048 pxor m7, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2049 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2050 pand m0, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2051 pand m1, m6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2052 %ifidn %1, mmx
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2053 paddusb m0, m7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2054 pand m1, [pb_FE]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2055 pandn m7, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2056 psrlq m1, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2057 psrlq m7, 1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2058 SWAP 0, 7
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2059 %else ; mmxext/sse2
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2060 psubusb m1, [pb_1]
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2061 pavgb m0, m7 ; a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2062 pavgb m1, m7 ; -a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2063 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2064 psubusb m5, m0
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2065 psubusb m2, m1
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2066 paddusb m5, m1 ; q1-a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2067 paddusb m2, m0 ; p1+a
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2068
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2069 ; store
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2070 %ifidn %2, v
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2071 movrow [dst_reg +mstride_reg*2], m2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2072 movrow [dst_reg +mstride_reg ], m3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2073 movrow [dst_reg], m4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2074 movrow [dst_reg + stride_reg ], m5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2075 %if mmsize == 16 && %4 == 8
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2076 movhps [dst8_reg+mstride_reg*2], m2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2077 movhps [dst8_reg+mstride_reg ], m3
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2078 movhps [dst8_reg], m4
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2079 movhps [dst8_reg+ stride_reg ], m5
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2080 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2081 %else ; h
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2082 add dst_reg, 2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2083 add dst2_reg, 2
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2084
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2085 ; 4x8/16 transpose
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2086 TRANSPOSE4x4B 2, 3, 4, 5, 6
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2087
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2088 %if mmsize == 8 ; mmx/mmxext (h)
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2089 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2090 %else ; sse2 (h)
12180
b24153464669 Attempt to fix x86-64 testsuite on fate.
rbultje
parents: 12177
diff changeset
2091 lea dst8_reg, [dst8_reg+mstride_reg+2]
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2092 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2093 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2094 %endif
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2095
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2096 %if mmsize == 8
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2097 %if %4 == 8 ; chroma
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2098 %ifidn %2, h
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2099 sub dst_reg, 2
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2100 %endif
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2101 cmp dst_reg, dst8_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2102 mov dst_reg, dst8_reg
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2103 jnz .next8px
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2104 %else
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2105 %ifidn %2, h
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2106 lea dst_reg, [dst_reg + stride_reg*8-2]
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2107 %else ; v
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2108 add dst_reg, 8
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2109 %endif
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2110 dec cnt_reg
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2111 jg .next8px
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2112 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2113 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2114
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2115 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2116 mov rsp, stack_reg ; restore stack pointer
12173
c47ddb7df424 Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents: 12168
diff changeset
2117 %endif
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2118 RET
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2119 %endmacro
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
2120
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
; Instantiate the inner loop filter for plain MMX.
; Macro arguments as used in the macro body: 1 = instruction-set tag (the body
; tests it against "mmx"), 2 = filter direction (v or h), 4 = block width
; (16, or 8 which the body marks as chroma).
; NOTE(review): arguments 3 and 5 are presumably the register counts handed to
; cglobal, as MBEDGE_LOOPFILTER does with its own 3rd/5th arguments; the
; INNER_LOOPFILTER header is not visible from here - confirm.
; SPLATB_REG is redefined to SPLATB_REG_MMX first, so any SPLATB_REG use inside
; the expansions below resolves to the MMX flavour.
2121 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2122 %define SPLATB_REG SPLATB_REG_MMX
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2123 INNER_LOOPFILTER mmx, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2124 INNER_LOOPFILTER mmx, h, 6, 16, 0
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2125 INNER_LOOPFILTER mmx, v, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2126 INNER_LOOPFILTER mmx, h, 6, 8, 0
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2127
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
; Instantiate the inner loop filter for MMXEXT. No new INIT_* is issued here,
; so the INIT_MMX setup from line 2121 is still in effect; only SPLATB_REG is
; switched to the MMXEXT flavour before the four expansions (v/h x width 16/8).
; Inside the macro, any tag other than "mmx" takes the "mmxext/sse2" branches
; (e.g. the pavgb path instead of the shift-based MMX path).
2128 %define SPLATB_REG SPLATB_REG_MMXEXT
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2129 INNER_LOOPFILTER mmxext, v, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2130 INNER_LOOPFILTER mmxext, h, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2131 INNER_LOOPFILTER mmxext, v, 6, 8, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2132 INNER_LOOPFILTER mmxext, h, 6, 8, 0
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2133
12168
b246b214c2e9 VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents: 12086
diff changeset
; Instantiate the inner loop filter for SSE2: switch to XMM registers and the
; SSE2 flavour of SPLATB_REG, then expand v/h for width 16 and width 8.
; The horizontal width-16 variant passes a different 3rd argument depending on
; whether m8 is defined; per the comments in the macros, m8 defined means
; x86-64 with SSE2, otherwise this is SSE2 on x86-32.
; NOTE(review): the 3rd argument is presumably the GPR count for cglobal, with
; x86-32 needing one more register - the INNER_LOOPFILTER header is not
; visible from here, confirm.
2134 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2135 %define SPLATB_REG SPLATB_REG_SSE2
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2136 INNER_LOOPFILTER sse2, v, 5, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2137 %ifdef m8
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2138 INNER_LOOPFILTER sse2, h, 5, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2139 %else
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2140 INNER_LOOPFILTER sse2, h, 6, 16, 13
12174
57038190cc5f Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents: 12173
diff changeset
2141 %endif
12204
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2142 INNER_LOOPFILTER sse2, v, 6, 8, 13
563339ea87aa Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents: 12198
diff changeset
2143 INNER_LOOPFILTER sse2, h, 6, 8, 13
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2144
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
; Instantiate the inner loop filter for SSSE3. No new INIT_* is issued here, so
; the INIT_XMM setup from line 2134 is still in effect; only SPLATB_REG is
; switched to the SSSE3 flavour. The argument lists mirror the SSE2 set,
; including the m8-dependent 3rd argument for the horizontal width-16 variant
; (m8 defined = x86-64, otherwise x86-32, per the comments in the macros).
2145 %define SPLATB_REG SPLATB_REG_SSSE3
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2146 INNER_LOOPFILTER ssse3, v, 5, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2147 %ifdef m8
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2148 INNER_LOOPFILTER ssse3, h, 5, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2149 %else
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2150 INNER_LOOPFILTER ssse3, h, 6, 16, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2151 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2152 INNER_LOOPFILTER ssse3, v, 6, 8, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2153 INNER_LOOPFILTER ssse3, h, 6, 8, 13
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2154
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2155 ;-----------------------------------------------------------------------------
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2156 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2157 ; int flimE, int flimI, int hev_thr);
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2158 ;-----------------------------------------------------------------------------
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2159
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2160 %macro MBEDGE_LOOPFILTER 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2161 %if %4 == 8 ; chroma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2162 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2163 %define dst8_reg r1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2164 %define mstride_reg r2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2165 %define E_reg r3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2166 %define I_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2167 %define hev_thr_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2168 %else ; luma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2169 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2170 %define mstride_reg r1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2171 %define E_reg r2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2172 %define I_reg r3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2173 %define hev_thr_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2174 %ifdef m8 ; x86-64, sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2175 %define dst8_reg r4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2176 %elif mmsize == 16 ; x86-32, sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2177 %define dst8_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2178 %else ; x86-32, mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2179 %define cnt_reg r5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2180 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2181 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2182 %define dst_reg r0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2183 %define stride_reg E_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2184 %define dst2_reg I_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2185 %ifndef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2186 %define stack_reg hev_thr_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2187 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2188
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2189 %define ssse3_or_higher 0
12274
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
2190 %ifnidn %1, sse2
1d207bb5cd29 Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents: 12272
diff changeset
2191 %if mmsize == 16
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2192 %define ssse3_or_higher 1
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2193 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2194 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2195
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2196 %if ssse3_or_higher
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2197 pxor m7, m7
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2198 %endif
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2199
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2200 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2201 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2202 SPLATB_REG m0, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2203 SPLATB_REG m1, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2204 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2206 ; align stack
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2207 mov stack_reg, rsp ; backup stack pointer
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2208 and rsp, ~(mmsize-1) ; align stack
12276
1c299b8f2930 Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents: 12275
diff changeset
2209 %if mmsize == 16
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2210 sub rsp, mmsize * 7
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2211 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2212 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2213 ; [3]=hev() result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2214 ; [4]=filter tmp result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2215 ; [5]/[6] = p2/q2 backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2216 ; [7]=lim_res sign result
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2217 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2218
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2219 %define flim_E [rsp]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2220 %define flim_I [rsp+mmsize]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2221 %define hev_thr [rsp+mmsize*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2222 %define mask_res [rsp+mmsize*3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2223 %define lim_res [rsp+mmsize*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2224 %define p0backup [rsp+mmsize*3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2225 %define q0backup [rsp+mmsize*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2226 %define p2backup [rsp+mmsize*5]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2227 %define q2backup [rsp+mmsize*6]
12276
1c299b8f2930 Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents: 12275
diff changeset
2228 %if mmsize == 16
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2229 %define lim_sign [rsp]
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2230 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2231 %define lim_sign [rsp+mmsize*7]
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2232 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2233
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2234 mova flim_E, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2235 mova flim_I, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2236 mova hev_thr, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2237
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2238 %else ; sse2 on x86-64
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2239
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2240 %define flim_E m9
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2241 %define flim_I m10
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2242 %define hev_thr m11
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2243 %define mask_res m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2244 %define lim_res m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2245 %define p0backup m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2246 %define q0backup m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2247 %define p2backup m13
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2248 %define q2backup m14
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2249 %define lim_sign m9
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2250
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2251 ; splat function arguments
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2252 SPLATB_REG flim_E, E_reg, m7 ; E
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2253 SPLATB_REG flim_I, I_reg, m7 ; I
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2254 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2255 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2256
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2257 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2258 mov cnt_reg, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2259 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2260 mov stride_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2261 neg mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2262 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2263 lea dst_reg, [dst_reg + stride_reg*4-4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2264 %if %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2265 lea dst8_reg, [dst8_reg+ stride_reg*4-4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2266 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2267 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2268
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2269 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2270 .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2271 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2272 ; read
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2273 lea dst2_reg, [dst_reg + stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2274 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2275 %if %4 == 8 && mmsize == 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2276 %define movrow movh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2277 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2278 %define movrow mova
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2279 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2280 movrow m0, [dst_reg +mstride_reg*4] ; p3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2281 movrow m1, [dst2_reg+mstride_reg*4] ; p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2282 movrow m2, [dst_reg +mstride_reg*2] ; p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2283 movrow m5, [dst2_reg] ; q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2284 movrow m6, [dst2_reg+ stride_reg] ; q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2285 movrow m7, [dst2_reg+ stride_reg*2] ; q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2286 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2287 movhps m0, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2288 movhps m2, [dst8_reg+mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2289 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2290 movhps m1, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2291 movhps m5, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2292 movhps m6, [dst8_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2293 movhps m7, [dst8_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2294 add dst8_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2295 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2296 %elif mmsize == 8 ; mmx/mmxext (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2297 ; read 8 rows of 8px each
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2298 movu m0, [dst_reg +mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2299 movu m1, [dst2_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2300 movu m2, [dst_reg +mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2301 movu m3, [dst_reg +mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2302 movu m4, [dst_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2303 movu m5, [dst2_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2304 movu m6, [dst2_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2305
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2306 ; 8x8 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2307 TRANSPOSE4x4B 0, 1, 2, 3, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2308 mova q0backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2309 movu m7, [dst2_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2310 TRANSPOSE4x4B 4, 5, 6, 7, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2311 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2312 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2313 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2314 mova m1, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2315 mova q0backup, m2 ; store q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2316 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2317 mova p0backup, m5 ; store p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2318 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2319 SWAP 2, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2320 SWAP 6, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2321 SWAP 5, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2322 %else ; sse2 (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2323 %if %4 == 16
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2324 lea dst8_reg, [dst_reg + stride_reg*8]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2325 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2326
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2327 ; read 16 rows of 8px each, interleave
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2328 movh m0, [dst_reg +mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2329 movh m1, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2330 movh m2, [dst_reg +mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2331 movh m5, [dst8_reg+mstride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2332 movh m3, [dst_reg +mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2333 movh m6, [dst8_reg+mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2334 movh m4, [dst_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2335 movh m7, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2336 punpcklbw m0, m1 ; A/I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2337 punpcklbw m2, m5 ; C/K
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2338 punpcklbw m3, m6 ; D/L
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2339 punpcklbw m4, m7 ; E/M
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2340
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2341 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2342 movh m1, [dst2_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2343 movh m6, [dst8_reg+mstride_reg*4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2344 movh m5, [dst2_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2345 movh m7, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2346 punpcklbw m1, m6 ; B/J
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2347 punpcklbw m5, m7 ; F/N
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2348 movh m6, [dst2_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2349 movh m7, [dst8_reg+ stride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2350 punpcklbw m6, m7 ; G/O
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2351
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2352 ; 8x16 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2353 TRANSPOSE4x4B 0, 1, 2, 3, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2354 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2355 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2356 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2357 mova q0backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2358 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2359 movh m7, [dst2_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2360 movh m1, [dst8_reg+ stride_reg*2]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2361 punpcklbw m7, m1 ; H/P
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2362 TRANSPOSE4x4B 4, 5, 6, 7, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2363 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2364 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2365 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2366 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2367 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2368 SWAP 2, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2369 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2370 mova m1, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2371 mova q0backup, m2 ; store q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2372 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2373 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2374 %ifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2375 SWAP 5, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2376 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2377 mova p0backup, m5 ; store p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2378 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2379 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2380 SWAP 2, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2381 SWAP 6, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2382 SWAP 5, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2383 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2384
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2385 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2386 mova m4, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2387 SWAP 4, 1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2388 psubusb m4, m0 ; p2-p3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2389 psubusb m0, m1 ; p3-p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2390 por m0, m4 ; abs(p3-p2)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2391
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2392 mova m4, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2393 SWAP 4, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2394 psubusb m4, m1 ; p1-p2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2395 mova p2backup, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2396 psubusb m1, m2 ; p2-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2397 por m1, m4 ; abs(p2-p1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2398
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2399 mova m4, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2400 SWAP 4, 6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2401 psubusb m4, m7 ; q2-q3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2402 psubusb m7, m6 ; q3-q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2403 por m7, m4 ; abs(q3-q2)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2404
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2405 mova m4, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2406 SWAP 4, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2407 psubusb m4, m6 ; q1-q2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2408 mova q2backup, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2409 psubusb m6, m5 ; q2-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2410 por m6, m4 ; abs(q2-q1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2411
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2412 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2413 mova m4, flim_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2414 pxor m3, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2415 psubusb m0, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2416 psubusb m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2417 psubusb m7, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2418 psubusb m6, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2419 pcmpeqb m0, m3 ; abs(p3-p2) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2420 pcmpeqb m1, m3 ; abs(p2-p1) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2421 pcmpeqb m7, m3 ; abs(q3-q2) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2422 pcmpeqb m6, m3 ; abs(q2-q1) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2423 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2424 pand m7, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2425 pand m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2426 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2427 pmaxub m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2428 pmaxub m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2429 pmaxub m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2430 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2431
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2432 ; normal_limit and high_edge_variance for p1-p0, q1-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2433 SWAP 7, 3 ; now m7 is zero
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2434 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2435 movrow m3, [dst_reg +mstride_reg] ; p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2436 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2437 movhps m3, [dst8_reg+mstride_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2438 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2439 %elifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2440 SWAP 3, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2441 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2442 mova m3, p0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2443 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2444
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2445 mova m1, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2446 SWAP 1, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2447 mova m6, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2448 SWAP 3, 6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2449 psubusb m1, m3 ; p1-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2450 psubusb m6, m2 ; p0-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2451 por m1, m6 ; abs(p1-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2452 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2453 mova m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2454 psubusb m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2455 psubusb m6, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2456 pcmpeqb m1, m7 ; abs(p1-p0) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2457 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2458 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2459 mova mask_res, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2460 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2461 pmaxub m0, m1 ; max_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2462 SWAP 1, 4 ; max_hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2463 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2464
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2465 SWAP 6, 4 ; now m6 is I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2466 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2467 movrow m4, [dst_reg] ; q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2468 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2469 movhps m4, [dst8_reg]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2470 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2471 %elifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2472 SWAP 4, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2473 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2474 mova m4, q0backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2475 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2476 mova m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2477 SWAP 1, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2478 mova m7, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2479 SWAP 7, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2480 psubusb m1, m5 ; q0-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2481 psubusb m7, m4 ; q1-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2482 por m1, m7 ; abs(q1-q0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2483 %ifidn %1, mmx
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2484 mova m7, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2485 psubusb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2486 psubusb m7, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2487 pxor m6, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2488 pcmpeqb m1, m6 ; abs(q1-q0) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2489 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2490 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2491 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2492 pand m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2493 %else ; mmxext/sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2494 pxor m7, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2495 pmaxub m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2496 pmaxub m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2497 psubusb m0, flim_I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2498 psubusb m6, hev_thr
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2499 pcmpeqb m0, m7 ; max(abs(..)) <= I
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2500 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2501 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2502 %ifdef m12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2503 SWAP 6, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2504 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2505 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2506 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2507
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2508 ; simple_limit
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2509 mova m1, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2510 SWAP 1, 3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2511 mova m6, m4 ; keep copies of p0/q0 around for later use
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2512 SWAP 6, 4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2513 psubusb m1, m4 ; p0-q0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2514 psubusb m6, m3 ; q0-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2515 por m1, m6 ; abs(q0-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2516 paddusb m1, m1 ; m1=2*abs(q0-p0)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2517
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2518 mova m7, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2519 SWAP 7, 2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2520 mova m6, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2521 SWAP 6, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2522 psubusb m7, m5 ; p1-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2523 psubusb m6, m2 ; q1-p1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2524 por m7, m6 ; abs(q1-p1)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2525 pxor m6, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2526 pand m7, [pb_FE]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2527 psrlq m7, 1 ; abs(q1-p1)/2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2528 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2529 psubusb m7, flim_E
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2530 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2531 pand m0, m7 ; normal_limit result
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2532
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2533 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2534 %ifdef m8 ; x86-64 && sse2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2535 mova m8, [pb_80]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2536 %define pb_80_var m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2537 %else ; x86-32 or mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2538 %define pb_80_var [pb_80]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2539 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2540 mova m1, m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2541 mova m7, m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2542 pxor m1, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2543 pxor m7, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2544 psubsb m1, m7 ; (signed) q0-p0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2545 mova m6, m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2546 mova m7, m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2547 pxor m6, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2548 pxor m7, pb_80_var
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2549 psubsb m6, m7 ; (signed) p1-q1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2550 mova m7, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2551 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2552 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2553 paddsb m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2554 pand m6, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2555 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2556 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2557 pand lim_res, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2558 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2559 mova m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2560 pand m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2561 mova lim_res, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2562 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2563 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2564
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2565 mova m1, [pb_F8]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2566 mova m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2567 paddsb m7, [pb_3]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2568 paddsb m6, [pb_4]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2569 pand m7, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2570 pand m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2571
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2572 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2573 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2574 pcmpgtb m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2575 psubb m0, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2576 psrlq m7, 3 ; +f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2577 psrlq m0, 3 ; -f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2578 pand m0, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2579 pandn m1, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2580 psubusb m3, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2581 paddusb m3, m1 ; p0+f2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2582
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2583 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2584 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2585 pcmpgtb m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2586 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2587 psrlq m6, 3 ; +f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2588 psrlq m1, 3 ; -f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2589 pand m1, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2590 pandn m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2591 psubusb m4, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2592 paddusb m4, m1 ; q0-f1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2593
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2594 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2595 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2596 mova m7, [pb_1]
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2597 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2598 mova m7, [pw_63]
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2599 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2600 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2601 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2602 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2603 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2604 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2605 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2606 mova m6, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2607 pcmpgtb m0, m1 ; which are negative
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2608 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2609 punpcklbw m6, m7 ; interleave with "1" for rounding
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2610 punpckhbw m1, m7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2611 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2612 punpcklbw m6, m0 ; signed byte->word
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2613 punpckhbw m1, m0
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2614 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2615 mova lim_sign, m0
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2616 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2617 mova m7, [pb_27_63]
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2618 %ifndef m8
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2619 mova lim_res, m1
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2620 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2621 %ifdef m10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2622 SWAP 0, 10 ; don't lose lim_sign copy
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2623 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2624 mova m0, m7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2625 pmaddubsw m7, m6
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2626 SWAP 6, 7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2627 pmaddubsw m0, m1
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2628 SWAP 1, 0
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2629 %ifdef m10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2630 SWAP 0, 10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2631 %else
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2632 mova m0, lim_sign
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2633 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2634 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2635 mova mask_res, m6 ; backup for later in filter
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2636 mova lim_res, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2637 pmullw m6, [pw_27]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2638 pmullw m1, [pw_27]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2639 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2640 paddw m1, m7
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2641 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2642 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2643 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2644 packsswb m6, m1 ; a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2645 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2646 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2647 pand m1, m0 ; -a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2648 pandn m0, m6 ; +a0
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2649 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2650 mova m6, [pb_18_63] ; pipelining
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2651 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2652 psubusb m3, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2653 paddusb m4, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2654 paddusb m3, m0 ; p0+a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2655 psubusb m4, m0 ; q0-a0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2656
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2657 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2658 SWAP 6, 7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2659 %ifdef m10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2660 SWAP 1, 10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2661 %else
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2662 mova m1, lim_res
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2663 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2664 mova m0, m7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2665 pmaddubsw m7, m6
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2666 SWAP 6, 7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2667 pmaddubsw m0, m1
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2668 SWAP 1, 0
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2669 %ifdef m10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2670 SWAP 0, 10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2671 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2672 mova m0, lim_sign
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2673 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2674 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2675 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2676 pmullw m6, [pw_18]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2677 pmullw m1, [pw_18]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2678 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2679 paddw m1, m7
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2680 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2681 mova m0, lim_sign
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2682 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2683 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2684 packsswb m6, m1 ; a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2685 pxor m1, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2686 psubb m1, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2687 pand m1, m0 ; -a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2688 pandn m0, m6 ; +a1
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2689 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2690 mova m6, [pb_9_63]
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2691 %endif
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2692 psubusb m2, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2693 paddusb m5, m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2694 paddusb m2, m0 ; p1+a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2695 psubusb m5, m0 ; q1-a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2696
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2697 %if ssse3_or_higher
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2698 SWAP 6, 7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2699 %ifdef m10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2700 SWAP 1, 10
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2701 %else
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2702 mova m1, lim_res
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2703 %endif
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2704 mova m0, m7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2705 pmaddubsw m7, m6
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2706 SWAP 6, 7
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2707 pmaddubsw m0, m1
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2708 SWAP 1, 0
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2709 %else
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2710 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2711 SWAP 6, 12
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2712 SWAP 1, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2713 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2714 mova m6, mask_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2715 mova m1, lim_res
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2716 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2717 pmullw m6, [pw_9]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2718 pmullw m1, [pw_9]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2719 paddw m6, m7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2720 paddw m1, m7
12279
7fb91885433c Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents: 12278
diff changeset
2721 %endif
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2722 %ifdef m9
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2723 SWAP 7, 9
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2724 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2725 mova m7, lim_sign
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2726 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2727 psraw m6, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2728 psraw m1, 7
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2729 packsswb m6, m1 ; a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2730 pxor m0, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2731 psubb m0, m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2732 pand m0, m7 ; -a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2733 pandn m7, m6 ; +a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2734 %ifdef m8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2735 SWAP 1, 13
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2736 SWAP 6, 14
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2737 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2738 mova m1, p2backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2739 mova m6, q2backup
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2740 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2741 psubusb m1, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2742 paddusb m6, m0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2743 paddusb m1, m7 ; p1+a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2744 psubusb m6, m7 ; q1-a1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2745
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2746 ; store
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2747 %ifidn %2, v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2748 movrow [dst2_reg+mstride_reg*4], m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2749 movrow [dst_reg +mstride_reg*2], m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2750 movrow [dst_reg +mstride_reg ], m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2751 movrow [dst_reg], m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2752 movrow [dst2_reg], m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2753 movrow [dst2_reg+ stride_reg ], m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2754 %if mmsize == 16 && %4 == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2755 add dst8_reg, mstride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2756 movhps [dst8_reg+mstride_reg*2], m1
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2757 movhps [dst8_reg+mstride_reg ], m2
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2758 movhps [dst8_reg], m3
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2759 add dst8_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2760 movhps [dst8_reg], m4
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2761 movhps [dst8_reg+ stride_reg ], m5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2762 movhps [dst8_reg+ stride_reg*2], m6
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2763 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2764 %else ; h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2765 inc dst_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2766 inc dst2_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2767
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2768 ; 4x8/16 transpose
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2769 TRANSPOSE4x4B 1, 2, 3, 4, 0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2770 SBUTTERFLY bw, 5, 6, 0
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2771
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2772 %if mmsize == 8 ; mmx/mmxext (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2773 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2774 add dst_reg, 4
12272
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2775 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2776 %else ; sse2 (h)
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2777 lea dst8_reg, [dst8_reg+mstride_reg+1]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2778 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
12214
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents: 12211
diff changeset
2779 lea dst_reg, [dst2_reg+mstride_reg+4]
657d353cd515 Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents: 12211
diff changeset
2780 lea dst8_reg, [dst8_reg+mstride_reg+4]
12272
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2781 %ifidn %1, sse4
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2782 add dst2_reg, 4
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2783 %endif
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2784 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2785 %ifidn %1, sse4
12268
259988e7ad0f Fix obvious bug in assignment. Somehow, the test vectors don't test this...
rbultje
parents: 12266
diff changeset
2786 lea dst2_reg, [dst8_reg+ stride_reg]
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2787 %endif
12272
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2788 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2789 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2790 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2791
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2792 %if mmsize == 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2793 %if %4 == 8 ; chroma
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2794 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2795 sub dst_reg, 5
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2796 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2797 cmp dst_reg, dst8_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2798 mov dst_reg, dst8_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2799 jnz .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2800 %else
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2801 %ifidn %2, h
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2802 lea dst_reg, [dst_reg + stride_reg*8-5]
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2803 %else ; v
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2804 add dst_reg, 8
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2805 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2806 dec cnt_reg
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2807 jg .next8px
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2808 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2809 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2810
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2811 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2812 mov rsp, stack_reg ; restore stack pointer
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2813 %endif
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2814 RET
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2815 %endmacro
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2816
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
; Instantiate MBEDGE_LOOPFILTER for the mmx and mmxext variants.
; Argument roles, as far as the visible macro tail shows:
;   %1 = instruction-set tag (the macro compares it against "sse4")
;   %2 = v or h; h takes the transpose-then-write store path
;   %4 = 16 or 8; the macro's loop tail labels the "== 8" case as chroma
;   %3, %5 = not referenced in the visible part of the macro; presumably
;            register counts for the function prologue -- TODO confirm
; The macro's "mmsize == 8" tail (labelled mmx/mmxext there) loops back to
; .next8px, so these variants process the edge in more than one pass.
2817 INIT_MMX
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2818 %define SPLATB_REG SPLATB_REG_MMX ; must precede the expansions below; SPLATB_REG's use is outside this chunk
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2819 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 ; v, size 16
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2820 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 ; h, size 16
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2821 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 ; v, size 8 (chroma path in the macro)
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2822 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 ; h, size 8 (chroma path in the macro)
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2823
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2824 %define SPLATB_REG SPLATB_REG_MMXEXT ; switch the splat helper before the mmxext expansions
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2825 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 ; same four shapes as the mmx group above
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2826 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2827 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2828 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2829
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
; Instantiate MBEDGE_LOOPFILTER for sse2.
; Both helper names are bound before the first expansion because the macro
; body refers to them by their generic names: WRITE_8W is used in the macro's
; "sse2 (h)" store path; SPLATB_REG's use is outside this chunk.
; The third argument differs between the "m8 defined" and "m8 undefined"
; builds for the h/16 variant only. The macro's own comment describes the
; m8-undefined case as "sse2 on x86-32 or mmx/mmxext".
; NOTE(review): the third argument is presumably a general-purpose register
; count -- confirm against the macro header, which is outside this chunk.
2830 INIT_XMM
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
2831 %define SPLATB_REG SPLATB_REG_SSE2
12272
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
2832 %define WRITE_8W WRITE_8W_SSE2 ; writer used by the h store path of the macro
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2833 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 ; v, size 16
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2834 %ifdef m8
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2835 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 ; h, size 16, when m8 exists
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2836 %else
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2837 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 ; h, size 16, when m8 does not exist
12205
d38e8565ba05 VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents: 12204
diff changeset
2838 %endif
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2839 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 ; v, size 8 (chroma path in the macro)
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2840 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 ; h, size 8 (chroma path in the macro)
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2841
12266
48d6738904a9 Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents: 12241
diff changeset
; Instantiate MBEDGE_LOOPFILTER for ssse3.
; Only SPLATB_REG is rebound here; WRITE_8W and the INIT_* mode are not
; touched, so whatever was set earlier in the file stays in effect.
; The macro body has "%if ssse3_or_higher" branches that use pmaddubsw; how
; ssse3_or_higher is derived is outside this chunk (presumably from the
; first macro argument -- TODO confirm).
; The argument pattern, including the m8-dependent third argument for the
; h/16 variant, is the same as for the sse2 expansions.
2842 %define SPLATB_REG SPLATB_REG_SSSE3
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2843 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 ; v, size 16
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2844 %ifdef m8
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2845 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 ; h, size 16, when m8 exists
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2846 %else
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2847 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 ; h, size 16, when m8 does not exist
12210
baf13deed97e Various VP8 x86 deblocking speedups
darkshikari
parents: 12209
diff changeset
2848 %endif
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 ; v, size 8 (chroma path in the macro)
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 ; h, size 8 (chroma path in the macro)
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2851
12272
dd90555c98fd Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents: 12268
diff changeset
; Instantiate MBEDGE_LOOPFILTER for sse4.
; Only WRITE_8W is rebound; SPLATB_REG keeps its most recent earlier
; definition. Only h variants are expanded here: in the visible part of the
; macro, both WRITE_8W and the "%ifidn %1, sse4" pointer adjustments occur
; only in the h store path, so a v expansion would presumably not differ
; from the previous instruction set's -- TODO confirm against the part of
; the macro outside this chunk.
2852 %define WRITE_8W WRITE_8W_SSE4
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2853 %ifdef m8
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 ; h, size 16, when m8 exists
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2855 %else
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 ; h, size 16, when m8 does not exist
12227
d07e6037846d Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents: 12214
diff changeset
2857 %endif
12275
709d5848abf8 Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents: 12274
diff changeset
2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 ; h, size 8 (chroma path in the macro)