annotate x86/vp8dsp.asm @ 12124:11b27985b3d0 libavcodec

Add native GSM 06.10 audio decoder.
author reimar
date Sat, 10 Jul 2010 07:55:06 +0000
parents d780ae746855
children b246b214c2e9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
24 %include "x86util.asm"
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 fourtap_filter_hw_m: times 4 dw -6, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 sixtap_filter_hw_m: times 4 dw 2, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
47 fourtap_filter_hb_m: times 8 db -6, 123
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
48 times 8 db 12, -1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
49 times 8 db -9, 93
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
50 times 8 db 50, -6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
51 times 8 db -6, 50
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
52 times 8 db 93, -9
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
53 times 8 db -1, 12
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
54 times 8 db 123, -6
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 sixtap_filter_hb_m: times 8 db 2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 fourtap_filter_v_m: times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 sixtap_filter_v_m: times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
101
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
102 bilinear_filter_vw_m: times 8 dw 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
103 times 8 dw 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
104 times 8 dw 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
105 times 8 dw 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
106 times 8 dw 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
107 times 8 dw 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
108 times 8 dw 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
109
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
110 bilinear_filter_vb_m: times 8 db 7, 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
111 times 8 db 6, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
112 times 8 db 5, 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
113 times 8 db 4, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
114 times 8 db 3, 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
115 times 8 db 2, 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
116 times 8 db 1, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
117
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
118 %ifdef PIC
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
119 %define fourtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
120 %define sixtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
121 %define fourtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
122 %define sixtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
123 %define fourtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
124 %define sixtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
125 %define bilinear_filter_vw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
126 %define bilinear_filter_vb r11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
133 %define sixtap_filter_v sixtap_filter_v_m
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
134 %define bilinear_filter_vw bilinear_filter_vw_m
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
135 %define bilinear_filter_vb bilinear_filter_vb_m
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
137
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
140
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
145 pw_20091: times 4 dw 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
146 pw_17734: times 4 dw 17734
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
147
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
148 cextern pw_3
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
149 cextern pb_3
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
150 cextern pw_4
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
151 cextern pb_4
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
152 cextern pw_64
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
153 cextern pb_80
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
154 cextern pb_F8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
155 cextern pb_FE
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
156
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
157 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
158
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
159 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
160 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
162 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
163 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
164 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166
12054
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
167 %macro FILTER_SSSE3 3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
168 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
169 lea r5d, [r5*3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
170 mova m3, [filter_h6_shuf2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
171 mova m4, [filter_h6_shuf3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
172 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
173 lea r11, [sixtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
174 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
175 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
176 mova m6, [sixtap_filter_hb+r5*8-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
177 mova m7, [sixtap_filter_hb+r5*8-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
178
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
179 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
180 movu m0, [r2-2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
181 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
182 mova m2, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
183 %ifidn %1, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
184 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
185 ; shuffle with a memory operand
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
186 punpcklbw m0, [r2+3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
187 %else
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
188 pshufb m0, [filter_h6_shuf1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
189 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
190 pshufb m1, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
191 pshufb m2, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
192 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
193 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
194 pmaddubsw m2, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
195 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
196 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
197 paddsw m0, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
198 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
199 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
200 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
201
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
202 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
203 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
204 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
205 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
206 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
207 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
208
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
209 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
210 shl r5d, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
211 mova m2, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
212 mova m3, [filter_h2_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
213 mova m4, [filter_h4_shuf]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
214 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
215 lea r11, [fourtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
216 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
217 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
218 mova m6, [fourtap_filter_hb+r5]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
219
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
220 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
221 movu m0, [r2-1]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
222 mova m1, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
223 pshufb m0, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
224 pshufb m1, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
225 pmaddubsw m0, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
226 pmaddubsw m1, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
227 paddsw m0, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
228 paddsw m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
229 psraw m0, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
230 packuswb m0, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
231 movh [r0], m0 ; store
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
232
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
233 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
234 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
235 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
236 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
237 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
238 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
239
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
240 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
241 shl r6d, 4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
242 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
243 lea r11, [fourtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
244 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
245 mova m5, [fourtap_filter_hb+r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
246 mova m6, [fourtap_filter_hb+r6]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
247 mova m7, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
248
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
249 ; read 3 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
250 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
251 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
252 movh m1, [r2+ r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
253 movh m2, [r2+2*r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
254 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
255
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
256 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
257 movh m3, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
258 mova m4, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
259 mova m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
260 punpcklbw m4, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
261 mova m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
262 punpcklbw m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
263 pmaddubsw m4, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
264 pmaddubsw m2, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
265 paddsw m4, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
266 mova m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
267 paddsw m4, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
268 psraw m4, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
269 packuswb m4, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
270 movh [r0], m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
271
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
272 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
273 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
274 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
275 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
276 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
277 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
278
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
279 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
280 lea r6d, [r6*3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
281 %ifdef PIC
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
282 lea r11, [sixtap_filter_hb_m]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
283 %endif
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
284 lea r6, [sixtap_filter_hb+r6*8]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
285
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
286 ; read 5 lines
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
287 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
288 sub r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
289 movh m0, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
290 movh m1, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
291 movh m2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
292 lea r2, [r2+r3*2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
293 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
294 movh m3, [r2]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
295 movh m4, [r2+r3]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
296
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
297 .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
298 movh m5, [r2+2*r3] ; read new row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
299 mova m6, m0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
300 punpcklbw m6, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
301 mova m0, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
302 punpcklbw m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
303 mova m7, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
304 punpcklbw m7, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
305 pmaddubsw m6, [r6-48]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
306 pmaddubsw m1, [r6-32]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
307 pmaddubsw m7, [r6-16]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
308 paddsw m6, m1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
309 paddsw m6, m7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
310 mova m1, m2
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
311 paddsw m6, [pw_64]
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
312 mova m2, m3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
313 psraw m6, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
314 mova m3, m4
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
315 packuswb m6, m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
316 mova m4, m5
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
317 movh [r0], m6
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
318
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
319 ; go to next line
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
320 add r0, r1
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
321 add r2, r3
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
322 dec r4 ; next row
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
323 jg .nextrow
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
324 REP_RET
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
325 %endmacro
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
326
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
327 INIT_MMX
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
328 FILTER_SSSE3 4, 0, 0
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
329 INIT_XMM
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
330 FILTER_SSSE3 8, 8, 7
b8f80fe02861 SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents: 12018
diff changeset
331
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
332 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
333 cglobal put_vp8_epel4_h4_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
334 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
335 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
336 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
337 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
338 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
339 movq mm5, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
340 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 pxor mm6, mm6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 psraw mm3, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380 cglobal put_vp8_epel4_h6_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
384 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
385 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 movq mm5, [sixtap_filter_hw+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387 movq mm6, [sixtap_filter_hw+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
389 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
390
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 pshufw mm3, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
414 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
415 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
419 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
420 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 psraw mm1, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
429 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
434 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 ; 8-pixel-wide block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
441 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444 mova m6, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
446
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
447 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448 movh m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
449 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
451 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
453 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
454 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
455 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
456 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
457 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
458 pmaddwd m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
459 pmaddwd m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
461
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
462 movh m1, [r2+3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
463 punpcklbw m1, m7 ; EFGHIJKL of the row (load at r2+3), relabelled ABCDEFGH for the second 4 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
464 mova m2, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
465 mova m3, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
466 mova m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
467 psrldq m2, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
468 psrldq m3, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
469 psrldq m4, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
470 punpcklwd m1, m2 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
471 punpcklwd m3, m4 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
472 pmaddwd m1, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
473 pmaddwd m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
474 paddd m1, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
475
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
476 packssdw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
477 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
478 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
479 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
480 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
481
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
489 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
493 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 lea r5, [sixtap_filter_hw+r5*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
495 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
497 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
499 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
501 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
502 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
503 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
504 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
505 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
506 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
507 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
508 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
509 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
511 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
512 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
513 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
514 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
515 pmaddwd m0, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
516 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
517 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
518 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
519 paddd m0, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
520
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
521 psrldq m6, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
522 mova m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
523 punpcklbw m6, m7 ; EFGHIJKL of the row (after psrldq by 4), relabelled ABCDEFGH for the second 4 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
524 mova m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
525 mova m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
526 mova m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
527 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
528 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
529 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
530 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
531 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
532 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
533 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
534 punpcklwd m6, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
535 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
536 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
537 pmaddwd m6, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
538 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
539 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
540 paddd m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
541 paddd m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
542
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
543 packssdw m0, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
551 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
555
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
;-----------------------------------------------------------------------------
; FILTER_V cpu_suffix, block_width, xmm_regs
;
; Emits the two vertical-only subpel filters for the register set selected by
; the preceding INIT_MMX / INIT_XMM:
;   put_vp8_epel<block_width>_v4_<cpu_suffix>   (4-tap)
;   put_vp8_epel<block_width>_v6_<cpu_suffix>   (6-tap)
;
; Register roles, as used by the code below:
;   r0 = destination pointer, advanced by r1 after each output row
;   r2 = source pointer,      advanced by r3 after each output row
;   r4 = rows left to produce; only the low 32 bits are used as the counter
;   r6 = filter index on entry, converted into a pointer to the tap vectors
;   m7 = zero (byte->word unpacking and final pack)
;
; NOTE(review): the "-32" / "-96" table biases imply the filter index is
; expected to be >= 1 -- confirm against the callers.
; NOTE(review): r1/r3 are added to pointers at full register width; if the C
; prototypes declare the strides as int, the same upper-half concern that
; applies to r4 applies to them -- confirm against the prototypes.
;-----------------------------------------------------------------------------
%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5                         ; filter index -> byte offset
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]      ; table base for PIC builds
%endif
    lea       r6, [fourtap_filter_v+r6-32]  ; r6 -> four 16-byte tap vectors
    mova      m6, [pw_64]                   ; rounding term added before >>7
    pxor      m7, m7
    mova      m5, [r6+48]                   ; last tap stays in a register

    ; read 3 lines (rows -1, 0, +1 relative to the first output row)
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3                        ; r2 is back on row 0
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                 ; read new row
    punpcklbw m4, m7
    mova      m3, m4                        ; keep an unscaled copy for the next pass
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    ; (each mova below also slides the row window down by one line)
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7                        ; words -> bytes, unsigned saturation
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; The row count is handled as a 32-bit value (put_vp8_pixels8_mmx uses
    ; r4d as well); testing only the low dword keeps the loop independent of
    ; whatever the caller left in the upper half of the register.
    dec      r4d                            ; next row
    jg .nextrow
    REP_RET


; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]                    ; filter index * 48
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]       ; table base for PIC builds
%endif
    lea       r6, [sixtap_filter_v+r6-96]   ; r6 -> six 16-byte tap vectors
    pxor      m7, m7

    ; read 5 lines (rows -2 .. +2 relative to the first output row)
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3                        ; r2 now sits on row +1
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    ; (the interleaved movas slide the five-row window down by one line)
    movh      m5, [r2+2*r3]                 ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]                   ; all 8 registers are busy, so the
                                            ; rounding term is read from memory
    psraw     m6, 7
    packuswb  m6, m7                        ; words -> bytes, unsigned saturation
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    ; 32-bit counter, see the 4-tap loop above
    dec      r4d                            ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
679
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
;-----------------------------------------------------------------------------
; FILTER_BILINEAR cpu_suffix, block_width, xmm_regs
;
; Emits the two-tap (bilinear) interpolators
;   put_vp8_bilinear<block_width>_v_<cpu_suffix>   (vertical)
;   put_vp8_bilinear<block_width>_h_<cpu_suffix>   (horizontal)
; Both produce two output rows per loop pass.
;
; Register roles, as used by the code below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = source pointer,      r3 = source stride
;   r4 = rows left to produce; only the low 32 bits are used as the counter
;   r5 = horizontal filter index (read by the _h variant)
;   r6 = vertical filter index   (read by the _v variant)
;   m4 = weight vector loaded at table offset (8 - index)*16 - 16
;   m5 = weight vector loaded at table offset index*16 - 16
;   m6 = zero
;
; Rounding: for the non-negative sums produced here,
; "psraw 2" followed by "pavgw with zero" equals (sum + 4) >> 3.
;
; NOTE(review): the "-16" table bias implies the filter index is expected to
; be in 1..7 -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4                         ; index * 16 (one weight vector each)
    sub      r5d, r6d                       ; (8 - index) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]    ; table base for PIC builds
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                        ; middle row feeds both output rows
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; first output row
    paddsw    m2, m3                        ; second output row
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize==8
    ; 64-bit registers: one register per output row
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m2
%else
    ; 128-bit registers: both rows packed into one register, low/high halves
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; The row count is handled as a 32-bit value (put_vp8_pixels8_mmx uses
    ; r4d as well); testing only the low dword keeps the loop independent of
    ; whatever the caller left in the upper half of the register.
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4                         ; index * 16 (one weight vector each)
    sub      r6d, r5d                       ; (8 - index) * 16
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]    ; table base for PIC builds
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow:
    movh      m0, [r2+r3*0+0]               ; first row
    movh      m1, [r2+r3*0+1]               ; first row, one pixel to the right
    movh      m2, [r2+r3*1+0]               ; second row
    movh      m3, [r2+r3*1+1]               ; second row, one pixel to the right
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1                        ; first output row
    paddsw    m2, m3                        ; second output row
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize==8
    ; 64-bit registers: one register per output row
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m2
%else
    ; 128-bit registers: both rows packed into one register, low/high halves
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit counter, see the vertical variant above
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
777
12082
8527154f6e81 SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents: 12054
diff changeset
;-----------------------------------------------------------------------------
; FILTER_BILINEAR_SSSE3 block_width
;
; SSSE3 versions of the two-tap (bilinear) interpolators
;   put_vp8_bilinear<block_width>_v_ssse3   (vertical)
;   put_vp8_bilinear<block_width>_h_ssse3   (horizontal)
; Pixels are paired as bytes and multiplied/added in one step with pmaddubsw
; against a byte weight vector. Two output rows are produced per loop pass.
;
; Register roles, as used by the code below:
;   r0 = destination pointer, r1 = destination stride
;   r2 = source pointer,      r3 = source stride
;   r4 = rows left to produce; only the low 32 bits are used as the counter
;   r5 = horizontal filter index (read by the _h variant)
;   r6 = vertical filter index   (read by the _v variant)
;   m3 = byte weight vector loaded at table offset index*16 - 16
;   m4 = zero
;
; Rounding: for the non-negative sums produced here,
; "psraw 2" followed by "pavgw with zero" equals (sum + 4) >> 3.
;
; NOTE(review): the "-16" table bias implies the filter index is expected to
; be >= 1 -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4                         ; index * 16 (one weight vector each)
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]    ; table base for PIC builds
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow:
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1                        ; interleave rows 0 and 1
    punpcklbw m1, m2                        ; interleave rows 1 and 2
    pmaddubsw m0, m3                        ; weighted sum -> first output row
    pmaddubsw m1, m3                        ; weighted sum -> second output row
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    ; 64-bit registers: one register per output row
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m1
%else
    ; 128-bit registers: both rows packed into one register, low/high halves
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; The row count is handled as a 32-bit value (put_vp8_pixels8_mmx uses
    ; r4d as well); testing only the low dword keeps the loop independent of
    ; whatever the caller left in the upper half of the register.
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4                         ; index * 16 (one weight vector each)
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]    ; table base for PIC builds
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]          ; pshufb control used to pair pixels
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow:
    movu      m0, [r2+r3*0]                 ; full-register unaligned loads
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3                        ; weighted sum -> first output row
    pmaddubsw m1, m3                        ; weighted sum -> second output row
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    ; 64-bit registers: one register per output row
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [r0+r1*0], m0
    movh   [r0+r1*1], m1
%else
    ; 128-bit registers: both rows packed into one register, low/high halves
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    ; 32-bit counter, see the vertical variant above
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
856
11992
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
857 cglobal put_vp8_pixels8_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
858 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
859 movq mm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
860 movq mm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
861 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
862 movq [r0+r1*0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
863 movq [r0+r1*1], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
864 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
865 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
866 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
867 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
868
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
869 cglobal put_vp8_pixels16_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
870 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
871 movq mm0, [r2+r3*0+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
872 movq mm1, [r2+r3*0+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
873 movq mm2, [r2+r3*1+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
874 movq mm3, [r2+r3*1+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
875 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
876 movq [r0+r1*0+0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
877 movq [r0+r1*0+8], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
878 movq [r0+r1*1+0], mm2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
879 movq [r0+r1*1+8], mm3
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
880 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
881 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
882 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
883 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
884
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
885 cglobal put_vp8_pixels16_sse, 5,5,2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
886 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
887 movups xmm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
888 movups xmm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
889 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
890 movaps [r0+r1*0], xmm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
891 movaps [r0+r1*1], xmm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
892 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
893 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
894 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
895 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
896
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
897 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
898 ; IDCT functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
899 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
900 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
901 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
902
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
903 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
904 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
905 movd mm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
906
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
907 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
908 paddw mm0, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
909 pxor mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
910 psraw mm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
911 psubw mm1, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
912 packuswb mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
913 packuswb mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
914 punpcklbw mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
915 punpcklbw mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
916 punpcklwd mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
917 punpcklwd mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
918
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
919 ; add DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
920 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
921 movd mm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
922 movd mm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
923 movd mm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
924 movd mm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
925 paddusb mm2, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
926 paddusb mm3, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
927 paddusb mm4, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
928 paddusb mm5, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
929 psubusb mm2, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
930 psubusb mm3, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
931 psubusb mm4, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
932 psubusb mm5, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
933 movd [r0], mm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
934 movd [r0+r2], mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
935 movd [r1], mm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
936 movd [r1+r2], mm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
937 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
938
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
939 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
940 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
941 movd xmm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
942 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
943 pxor xmm1, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
944 movq xmm2, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
945
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
946 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
947 paddw xmm0, xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
948 movd xmm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
949 movd xmm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
950 movd xmm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
951 movd xmm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
952 psraw xmm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
953 pshuflw xmm0, xmm0, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
954 punpcklqdq xmm0, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
955 punpckldq xmm2, xmm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
956 punpckldq xmm4, xmm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
957 punpcklbw xmm2, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
958 punpcklbw xmm4, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
959 paddw xmm2, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
960 paddw xmm4, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
961 packuswb xmm2, xmm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
962 movd [r0], xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
963 pextrd [r0+r2], xmm2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
964 pextrd [r1], xmm2, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
965 pextrd [r1+r2], xmm2, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
966 RET
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
967
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
968 ;-----------------------------------------------------------------------------
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
969 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
970 ;-----------------------------------------------------------------------------
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
971
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
972 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
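973 ; this macro assumes that m6/m7 have words for 20091/17734 loaded; 17734 is 35468/2 (35468 does not fit pmulhw's signed 16-bit operand), so the inputs are doubled before the m7 multiply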
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
974 %macro VP8_MULTIPLY_SUMSUB 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
975 mova %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
976 mova %4, %2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
977 pmulhw %3, m6 ;20091(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
978 pmulhw %4, m6 ;20091(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
979 paddw %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
980 paddw %4, %2
12018
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
981 paddw %1, %1
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
982 paddw %2, %2
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
983 pmulhw %1, m7 ;35468(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
984 pmulhw %2, m7 ;35468(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
985 psubw %1, %4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
986 paddw %2, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
987 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
988
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
989 ; calculate x0=%1+%3; x1=%1-%3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
990 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
991 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
992 ; %5/%6 are temporary registers
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
993 ; we assume m6/m7 have constant words 20091/17734 loaded in them
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
994 %macro VP8_IDCT_TRANSFORM4x4_1D 6
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
995 SUMSUB_BA m%3, m%1, m%5 ;t0, t1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
996 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
997 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
998 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
999 SWAP %4, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1000 SWAP %4, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1001 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1002
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1003 INIT_MMX
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1004 cglobal vp8_idct_add_mmx, 3, 3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1005 ; load block data
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1006 movq m0, [r1]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1007 movq m1, [r1+8]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1008 movq m2, [r1+16]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1009 movq m3, [r1+24]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1010 movq m6, [pw_20091]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1011 movq m7, [pw_17734]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1012
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1013 ; actual IDCT
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1014 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1015 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1016 paddw m0, [pw_4]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1017 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1018 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1019
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1020 ; store
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1021 pxor m4, m4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1022 lea r1, [r0+2*r2]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1023 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1024 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1025
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1026 RET
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1027
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
1028 ;-----------------------------------------------------------------------------
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1029 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1030 ;-----------------------------------------------------------------------------
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1031
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1032 %macro SCATTER_WHT 1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1033 pextrw r1d, m0, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1034 pextrw r2d, m1, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1035 mov [r0+2*16*0], r1w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1036 mov [r0+2*16*1], r2w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1037 pextrw r1d, m2, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1038 pextrw r2d, m3, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1039 mov [r0+2*16*2], r1w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1040 mov [r0+2*16*3], r2w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1041 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1042
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1043 %macro HADAMARD4_1D 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1044 SUMSUB_BADC m%2, m%1, m%4, m%3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1045 SUMSUB_BADC m%4, m%2, m%3, m%1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1046 SWAP %1, %4, %3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1047 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1048
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1049 INIT_MMX
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1050 cglobal vp8_luma_dc_wht_mmxext, 2,3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1051 movq m0, [r1]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1052 movq m1, [r1+8]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1053 movq m2, [r1+16]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1054 movq m3, [r1+24]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1055 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1056 TRANSPOSE4x4W 0, 1, 2, 3, 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1057 paddw m0, [pw_3]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1058 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1059 psraw m0, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1060 psraw m1, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1061 psraw m2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1062 psraw m3, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1063 SCATTER_WHT 0
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1064 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1065 SCATTER_WHT 1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1066 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1067 SCATTER_WHT 2
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1068 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1069 SCATTER_WHT 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1070 RET
12086
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1071
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1072 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1073 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1074 ;-----------------------------------------------------------------------------
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1075
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1076 ; macro called with 7 mm register indexes as argument, and 4 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1077 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1078 ; first 4 mm registers will carry the transposed pixel data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1079 ; the other three are scratchspace (one would be sufficient, but this allows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1080 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1081 ;
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1082 ; first two regular registers are buf+4*stride and buf+5*stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1083 ; third is -stride, fourth is +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1084 %macro READ_8x4_INTERLEAVED 11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1085 ; interleave 8 (A-H) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1086 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1087 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1088 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1089 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1090 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1091 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1092 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1093 punpcklbw m%1, m%5 ; A/B interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1094 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1095 punpcklbw m%2, m%6 ; C/D interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1096 punpcklbw m%3, m%7 ; E/F interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1097 punpcklbw m%4, m%5 ; G/H interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1098 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1099
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1100 ; macro called with 7 mm register indexes as argument, and 5 regular registers
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1101 ; first 11 mean the same as READ_8x4_INTERLEAVED above
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1102 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1103 ; will be set to second regular register + 8*stride at the end
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1104 %macro READ_16x4_INTERLEAVED 12
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1105 ; interleave 16 (A-P) rows of 4 pixels each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1106 lea %12, [r0+8*r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1107
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1108 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1109 movd m%1, [%8+%10*4] ; A0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1110 movd m%3, [%12+%10*4] ; I0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1111 movd m%2, [%8+%10*2] ; C0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1112 movd m%4, [%12+%10*2] ; K0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1113 movd m%6, [%8+%10] ; D0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1114 movd m%5, [%12+%10] ; L0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1115 movd m%7, [%12] ; M0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1116 add %12, %11
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1117 punpcklbw m%1, m%3 ; A/I
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1118 movd m%3, [%8] ; E0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1119 punpcklbw m%2, m%4 ; C/K
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1120 punpcklbw m%6, m%5 ; D/L
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1121 punpcklbw m%3, m%7 ; E/M
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1122 punpcklbw m%2, m%6 ; C/D/K/L interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1123
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1124 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1125 movd m%5, [%9+%10*4] ; B0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1126 movd m%4, [%12+%10*4] ; J0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1127 movd m%7, [%9] ; F0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1128 movd m%6, [%12] ; N0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1129 punpcklbw m%5, m%4 ; B/J
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1130 punpcklbw m%7, m%6 ; F/N
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1131 punpcklbw m%1, m%5 ; A/B/I/J interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1132 punpcklbw m%3, m%7 ; E/F/M/N interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1133 movd m%4, [%9+%11] ; G0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1134 movd m%6, [%12+%11] ; O0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1135 movd m%5, [%9+%11*2] ; H0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1136 movd m%7, [%12+%11*2] ; P0-3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1137 punpcklbw m%4, m%6 ; G/O
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1138 punpcklbw m%5, m%7 ; H/P
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1139 punpcklbw m%4, m%5 ; G/H/O/P interleaved
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1140 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1141
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1142 ; write 4 mm registers of 2 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1143 ; first four arguments are mm register indexes containing source data
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1144 ; last four are registers containing buf+4*stride, buf+5*stride,
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1145 ; -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1146 %macro WRITE_4x2D 8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1147 ; write out (2 dwords per register): low dwords go to rows 0/2/4/6 first, then the high dwords to rows 1/3/5/7 (rows relative to buf, given %5=buf+4*stride, %6=buf+5*stride, %7=-stride, %8=+stride as listed above the macro); clobbers m%1-m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1148 movd [%5+%7*4], m%1 ; row 0 <- low dword of m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1149 movd [%5+%7*2], m%2 ; row 2 <- low dword of m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1150 movd [%5], m%3 ; row 4 <- low dword of m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1151 movd [%6+%8], m%4 ; row 6 <- low dword of m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1152 punpckhdq m%1, m%1 ; copy the high dword into the low dword so movd can store it
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1153 punpckhdq m%2, m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1154 punpckhdq m%3, m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1155 punpckhdq m%4, m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1156 movd [%6+%7*4], m%1 ; row 1 <- former high dword of m%1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1157 movd [%5+%7], m%2 ; row 3 <- former high dword of m%2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1158 movd [%6], m%3 ; row 5 <- former high dword of m%3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1159 movd [%6+%8*2], m%4 ; row 7 <- former high dword of m%4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1160 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1161
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1162 ; write 4 xmm registers of 4 dwords each
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1163 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1164 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1165 ; we add 1*stride to the third regular register in the process
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1166 %macro WRITE_4x4D 9
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1167 ; write out (4 dwords per register), start with dwords zero; m%1 covers rows 0-3, m%2 rows 4-7, m%3 rows 8-11, m%4 rows 12-15 (rows relative to buf, given %5=buf+4*stride, %6=buf+5*stride, %7=buf+12*stride, %8=-stride, %9=+stride as listed above the macro); clobbers m%1-m%4 and advances %7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1168 movd [%5+%8*4], m%1 ; row 0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1169 movd [%5], m%2 ; row 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1170 movd [%5+%9*4], m%3 ; row 8
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1171 movd [%5+%9*8], m%4 ; row 12
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1172
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1173 ; store dwords 1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1174 psrldq m%1, 4 ; shift the register right by one dword so the next dword is in movd position
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1175 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1176 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1177 psrldq m%4, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1178 movd [%6+%8*4], m%1 ; row 1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1179 movd [%6], m%2 ; row 5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1180 movd [%6+%9*4], m%3 ; row 9
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1181 movd [%6+%9*8], m%4 ; row 13
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1182
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1183 ; write dwords 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1184 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1185 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1186 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1187 psrldq m%4, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1188 movd [%5+%8*2], m%1 ; row 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1189 movd [%6+%9], m%2 ; row 6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1190 movd [%7+%8*2], m%3 ; row 10 (%7 is still buf+12*stride here)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1191 movd [%7+%9*2], m%4 ; row 14
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1192 add %7, %9 ; %7 += stride (now buf+13*stride) so the same two offsets reach rows 11 and 15 below
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1193
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1194 ; store dwords 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1195 psrldq m%1, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1196 psrldq m%2, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1197 psrldq m%3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1198 psrldq m%4, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1199 movd [%5+%8], m%1 ; row 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1200 movd [%6+%9*2], m%2 ; row 7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1201 movd [%7+%8*2], m%3 ; row 11 (%7 is buf+13*stride after the add above)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1202 movd [%7+%9*2], m%4 ; row 15
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1203 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1204
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1205 %macro SIMPLE_LOOPFILTER 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1206 cglobal vp8_%2_loop_filter_simple_%1, 3, %3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1207 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1208 mov r5, rsp ; backup stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1209 and rsp, ~(mmsize-1) ; align stack
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1210 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1211 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1212 mov r3, 2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1213 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1214
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1215 ; splat register with "flim"
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1216 movd m7, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1217 punpcklbw m7, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1218 %if mmsize == 16 ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1219 punpcklwd m7, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1220 pshufd m7, m7, 0x0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1221 %elifidn %1, mmx
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1222 punpcklwd m7, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1223 punpckldq m7, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1224 %else ; mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1225 pshufw m7, m7, 0x0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1226 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1227
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1228 ; set up indexes to address 4 rows
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1229 mov r2, r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1230 neg r1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1231 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1232 lea r0, [r0+4*r2-2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1233 sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1234 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1235
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1236 %if mmsize == 8 ; mmx / mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1237 .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1238 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1239 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1240 ; read 4 half/full rows of pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1241 mova m0, [r0+r1*2] ; p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1242 mova m1, [r0+r1] ; p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1243 mova m2, [r0] ; q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1244 mova m3, [r0+r2] ; q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1245 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1246 lea r4, [r0+r2]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1247
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1248 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1249 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1250 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1251 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1252 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1253 TRANSPOSE4x4W 0, 1, 2, 3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1254
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1255 mova [rsp], m0 ; store p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1256 mova [rsp+mmsize], m3 ; store q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1257 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1258
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1259 ; simple_limit
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1260 mova m5, m2 ; m5=backup of q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1261 mova m6, m1 ; m6=backup of p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1262 psubusb m1, m2 ; p0-q0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1263 psubusb m2, m6 ; q0-p0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1264 por m1, m2 ; FFABS(p0-q0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1265 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1266
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1267 mova m4, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1268 mova m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1269 psubusb m3, m0 ; q1-p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1270 psubusb m0, m4 ; p1-q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1271 por m3, m0 ; FFABS(p1-q1)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1272 mova m0, [pb_80]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1273 pxor m2, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1274 pxor m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1275 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1276 pand m3, [pb_FE]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1277 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1278 paddusb m3, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1279 psubusb m3, m7
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1280 pxor m1, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1281 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1282
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1283 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1284 mova m4, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1285 pxor m5, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1286 pxor m0, m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1287 psubsb m5, m0 ; q0-p0 (signed)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1288 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1289 paddsb m2, m5
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1290 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1291 pand m2, m3 ; apply filter mask (m3)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1292
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1293 mova m3, [pb_F8]
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1294 mova m1, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1295 paddsb m2, [pb_4] ; f1<<3=a+4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1296 paddsb m1, [pb_3] ; f2<<3=a+3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1297 pand m2, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1298 pand m1, m3 ; cache f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1299
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1300 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1301 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1302 pcmpgtb m0, m2 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1303 psubb m3, m2 ; -f1<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1304 psrlq m2, 3 ; +f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1305 psrlq m3, 3 ; -f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1306 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1307 pandn m0, m2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1308 psubusb m4, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1309 paddusb m4, m3 ; q0-f1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1310
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1311 pxor m0, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1312 pxor m3, m3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1313 pcmpgtb m0, m1 ; which values are <0?
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1314 psubb m3, m1 ; -f2<<3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1315 psrlq m1, 3 ; +f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1316 psrlq m3, 3 ; -f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1317 pand m3, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1318 pandn m0, m1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1319 paddusb m6, m0
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1320 psubusb m6, m3 ; p0+f2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1321
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1322 ; store
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1323 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1324 mova [r0], m4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1325 mova [r0+r1], m6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1326 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1327 mova m0, [rsp] ; p1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1328 SWAP 2, 4 ; q0 (filtered, was in m4)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1329 SWAP 1, 6 ; p0 (filtered, was in m6)
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1330 mova m3, [rsp+mmsize] ; q1
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1331
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1332 TRANSPOSE4x4B 0, 1, 2, 3, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1333 %if mmsize == 16 ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1334 add r3, r1 ; change from r4*8*stride to r0+8*stride
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1335 WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1336 %else ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1337 WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1338 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1339 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1340
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1341 %if mmsize == 8 ; mmx/mmxext
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1342 ; next 8 pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1343 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1344 add r0, 8 ; advance 8 cols = pixels
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1345 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1346 lea r0, [r0+r2*8] ; advance 8 rows = lines
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1347 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1348 dec r3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1349 jg .next8px
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1350 %ifidn %2, v
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1351 REP_RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1352 %else ; h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1353 mov rsp, r5 ; restore stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1354 RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1355 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1356 %else ; sse2
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1357 %ifidn %2, h
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1358 mov rsp, r5 ; restore stack pointer
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1359 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1360 RET
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1361 %endif
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1362 %endmacro
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1363
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
; Instantiate the SIMPLE_LOOPFILTER macro once per instruction-set tag and
; filtering direction.
;   arg 1: instruction-set tag (mmx / mmxext / sse2).
;   arg 2: direction; the macro body tests it with %ifidn against v and
;          otherwise takes the h path (stack reload, transpose, WRITE_4x*D).
;   arg 3: NOTE(review): looks like the number of general-purpose registers
;          requested (the h path touches r4 and r5; the mmsize == 8 loop
;          counts down in r3) -- the macro header is outside this view,
;          confirm there.
; The macro body labels its mmsize == 8 branches "mmx/mmxext" and its
; mmsize == 16 branches "sse2"; presumably INIT_MMX / INIT_XMM select
; between them -- confirm against the shared x86 include file.
1364 INIT_MMX
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1365 SIMPLE_LOOPFILTER mmx, v, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1366 SIMPLE_LOOPFILTER mmx, h, 6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1367 SIMPLE_LOOPFILTER mmxext, v, 4
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1368 SIMPLE_LOOPFILTER mmxext, h, 6
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
; Same macro again for the variants tagged sse2.
1369 INIT_XMM
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1370 SIMPLE_LOOPFILTER sse2, v, 3
d780ae746855 Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents: 12082
diff changeset
1371 SIMPLE_LOOPFILTER sse2, h, 6