annotate x86/vp8dsp.asm @ 12043:f9a0bd0888a4 libavcodec

mpegaudio: call ff_mpegaudiodec_init_mmx() only from float decoder The mmx code is floating-point only, and this function does not know from which decoder it is called. Without this change, the integer decoder only "works" because the size of the context struct is smaller in this case, and the mmx init function writes the function pointer outside the allocated context.
author mru
date Thu, 01 Jul 2010 23:21:17 +0000
parents 1b11083f4bb4
children b8f80fe02861
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
24 %include "x86util.asm"
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 fourtap_filter_hw_m: times 4 dw -6, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 sixtap_filter_hw_m: times 4 dw 2, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
47 fourtap_filter_hb_m: times 8 db -6, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
48 times 8 db 123, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
49 times 8 db -9, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
50 times 8 db 93, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
51 times 8 db -6, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
52 times 8 db 50, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
53 times 8 db -1, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
54 times 8 db 12, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 sixtap_filter_hb_m: times 8 db 2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 fourtap_filter_v_m: times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 sixtap_filter_v_m: times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
101
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
102 bilinear_filter_vw_m: times 8 dw 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
103 times 8 dw 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
104 times 8 dw 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
105 times 8 dw 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
106 times 8 dw 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
107 times 8 dw 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
108 times 8 dw 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
109
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
110 bilinear_filter_vb_m: times 8 db 7, 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
111 times 8 db 6, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
112 times 8 db 5, 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
113 times 8 db 4, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
114 times 8 db 3, 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
115 times 8 db 2, 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
116 times 8 db 1, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
117
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
118 %ifdef PIC
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
119 %define fourtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
120 %define sixtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
121 %define fourtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
122 %define sixtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
123 %define fourtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
124 %define sixtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
125 %define bilinear_filter_vw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
126 %define bilinear_filter_vb r11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
133 %define sixtap_filter_v sixtap_filter_v_m
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
134 %define bilinear_filter_vw bilinear_filter_vw_m
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
135 %define bilinear_filter_vb bilinear_filter_vb_m
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
137
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
139 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
140
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
145 pw_20091: times 4 dw 20091
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
146 pw_17734: times 4 dw 17734
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
147
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
148 cextern pw_3
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
149 cextern pw_4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
150 cextern pw_64
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
151
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
152 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
153
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
154 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
155 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
156 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
157 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
158 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
159 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
160 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
162 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
163 cglobal put_vp8_epel4_h4_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
164 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
167 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
168 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
169 movq mm5, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
170 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
171 pxor mm6, mm6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
172
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
173 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
174 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
175
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
176 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
177 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
178 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
179 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
180 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
181 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
182 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
183 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
184 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
185 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
186 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
187
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
188 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
189 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
190 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
191 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
192 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
193 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
194
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
195 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
196 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
197 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
198 psraw mm3, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
199 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
200 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
201
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
202 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
203 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
204 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
205 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
206 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
207 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
208
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
209 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
210 cglobal put_vp8_epel4_h6_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
211 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
212 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
213 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
214 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
215 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
216 movq mm5, [sixtap_filter_hw+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
217 movq mm6, [sixtap_filter_hw+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
218 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
219 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
220
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
221 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
222 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
223
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
224 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
225 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
226 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
227 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
228 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
229 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
230 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
231 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
232 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
233 pshufw mm3, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
234 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
235 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
236 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
237 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
238 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
239 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
240
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
241 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
242 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
243 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
244 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
245 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
246 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
247 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
248 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
249 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
250 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
251
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
252 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
253 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
254 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
255 psraw mm1, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
256 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
257 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
258
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
259 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
260 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
261 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
262 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
263 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
264 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
265
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
266 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
267 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
268 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
269 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
270 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
271 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
272 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
273 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
274 mova m6, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
275 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
276
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
277 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
278 movh m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
279 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
280 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
281 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
282 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
283 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
284 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
285 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
286 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
287 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
288 pmaddwd m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
289 pmaddwd m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
290 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
291
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
292 movh m1, [r2+3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
293 punpcklbw m1, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
294 mova m2, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
295 mova m3, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
296 mova m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
297 psrldq m2, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
298 psrldq m3, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
299 psrldq m4, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
300 punpcklwd m1, m2 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
301 punpcklwd m3, m4 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
302 pmaddwd m1, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
303 pmaddwd m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
304 paddd m1, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
305
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
306 packssdw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
307 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
308 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
309 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
310 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
311
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
312 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
313 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
314 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
315 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
316 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
317 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
318
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
319 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
320 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
321 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
322 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
323 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
324 lea r5, [sixtap_filter_hw+r5*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
325 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
326
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
327 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
328 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
329 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
330 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
331 punpcklbw m0, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
332 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
333 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
334 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
335 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
336 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
337 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
338 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
339 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
340 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345 pmaddwd m0, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 paddd m0, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351 psrldq m6, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 mova m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353 punpcklbw m6, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354 mova m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 mova m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 mova m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364 punpcklwd m6, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367 pmaddwd m6, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 paddd m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371 paddd m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373 packssdw m0, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
384 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
385
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 mova m2, [pw_64]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
389 mova m3, [filter_h4_shuf]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
390 mova m4, [filter_h6_shuf2]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395 mova m6, [fourtap_filter_hb+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398 movu m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 pshufb m1, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
414 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
415 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 lea r5d, [r5*3]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
419 mova m3, [filter_h6_shuf1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
420 mova m4, [filter_h6_shuf2]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 mova m6, [sixtap_filter_hb+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426 mova m7, [sixtap_filter_hb+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
429 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433 pshufb m1, m4
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
434 pshufb m2, [filter_h6_shuf3]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437 pmaddubsw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
441 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
446 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
447 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
449 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
451
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 %macro FILTER_V 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
453 ; 4x4 block, V-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
454 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
455 shl r6d, 5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
456 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
457 lea r11, [fourtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
458 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
459 lea r6, [fourtap_filter_v+r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 mova m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
461 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
462 mova m5, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
463
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
464 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
465 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
466 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
467 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
468 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
469 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
470 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
471 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
472 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
473
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
474 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
475 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
476 movh m4, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
477 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
478 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
479 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
480 pmullw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
481 paddsw m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485 pmullw m1, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
489 paddsw m4, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
493 paddsw m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
495 packuswb m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
497
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
499 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
501 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
502 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
503 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
504
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
505
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
506 ; 4x4 block, V-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
507 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
508 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
509 lea r6, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
511 lea r11, [sixtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
512 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
513 lea r6, [sixtap_filter_v+r6-96]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
514 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
515
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
516 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
517 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
518 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
519 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
520 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
521 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
522 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
523 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
524 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
525 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
526 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
527 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
528 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
529 punpcklbw m3, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
530 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
531
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
532 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
533 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
534 mova m5, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
535 pmullw m5, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
536 mova m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
537 pmullw m6, [r6+64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
538 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
539
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
540 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
541 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
542 punpcklbw m5, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
543 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 paddsw m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548 paddsw m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550 pmullw m3, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
551 paddsw m6, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554 pmullw m5, [r6+80]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
555 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
556
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
557 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
558 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
559 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
560 packuswb m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
561 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
562
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
563 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
564 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
565 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
566 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
567 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
568 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
569 %endmacro
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
570
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
571 INIT_MMX
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
572 FILTER_V mmxext, 4, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
573 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
574 FILTER_V sse2, 8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
575
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
576 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
577 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
578 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
579 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
580 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
581 mova m5, [fourtap_filter_hb+r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
582 mova m6, [fourtap_filter_hb+r6]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
583 mova m7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
584
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
585 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
586 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
587 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
588 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
589 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
590 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
591
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
592 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
593 movh m3, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
594 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
595 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
596 punpcklbw m4, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
597 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
598 pmaddubsw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
599 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
600 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
601 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
602 paddsw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
603 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
604 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
605 packuswb m4, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
606 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
607
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
608 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
609 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
610 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
611 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
612 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
613 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
614
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
615 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
616 lea r6d, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
617 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
618 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
619 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
620 lea r6, [sixtap_filter_hb+r6*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
621
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
622 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
623 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
624 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
625 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
626 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
627 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
628 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
629 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
630 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
631 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
632
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
633 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
634 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
635 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
636 punpcklbw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
637 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
638 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
639 mova m7, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
640 punpcklbw m7, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
641 pmaddubsw m6, [r6-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
642 pmaddubsw m1, [r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
643 pmaddubsw m7, [r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
644 paddsw m6, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
645 paddsw m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
646 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
647 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
648 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
649 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
650 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
651 packuswb m6, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
652 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
653 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
654
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
655 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
656 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
657 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
658 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
659 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
660 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
661
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
662 %macro FILTER_BILINEAR 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
663 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
664 mov r5d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
665 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
666 sub r5d, r6d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
667 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
668 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
669 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
670 pxor m6, m6
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
671 mova m4, [bilinear_filter_vw+r5-16]
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
672 mova m5, [bilinear_filter_vw+r6-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
673 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
674 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
675 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
676 movh m3, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
677 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
678 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
679 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
680 mova m2, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
681 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
682 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
683 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
684 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
685 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
686 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
687 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
688 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
689 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
690 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
691 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
692 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
693 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
694 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
695 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
696 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
697 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
698 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
699 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
700 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
701
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
702 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
703 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
704 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
705 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
706 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
707
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
708 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
709 mov r6d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
710 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
711 sub r6d, r5d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
712 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
713 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
714 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
715 pxor m6, m6
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
716 mova m4, [bilinear_filter_vw+r6-16]
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
717 mova m5, [bilinear_filter_vw+r5-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
718 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
719 movh m0, [r2+r3*0+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
720 movh m1, [r2+r3*0+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
721 movh m2, [r2+r3*1+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
722 movh m3, [r2+r3*1+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
723 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
724 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
725 punpcklbw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
726 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
727 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
728 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
729 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
730 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
731 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
732 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
733 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
734 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
735 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
736 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
737 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
738 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
739 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
740 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
741 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
742 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
743 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
744 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
745 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
746 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
747
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
748 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
749 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
750 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
751 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
752 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
753 %endmacro
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
754
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
755 INIT_MMX
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
756 FILTER_BILINEAR mmxext, 4, 0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
757 INIT_XMM
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
758 FILTER_BILINEAR sse2, 8, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
759
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
760 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
761 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
762 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
763 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
764 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
765 pxor m4, m4
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
766 mova m3, [bilinear_filter_vb+r6-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
767 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
768 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
769 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
770 movh m2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
771 punpcklbw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
772 punpcklbw m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
773 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
774 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
775 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
776 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
777 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
778 pavgw m1, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
779 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
780 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
781 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
782
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
783 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
784 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
785 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
786 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
787 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
788
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
789 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
790 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
791 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
792 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
793 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
794 pxor m4, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
795 mova m2, [filter_h2_shuf]
12000
a717c1a93036 Fix VP8 bilinear mc on x86_64
darkshikari
parents: 11992
diff changeset
796 mova m3, [bilinear_filter_vb+r5-16]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
797 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
798 movu m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
799 movu m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
800 pshufb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
801 pshufb m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
802 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
803 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
804 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
805 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
806 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
807 pavgw m1, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
808 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
809 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
810 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
811
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
812 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
813 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
814 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
815 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
816 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
817
11992
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
818 cglobal put_vp8_pixels8_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
819 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
820 movq mm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
821 movq mm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
822 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
823 movq [r0+r1*0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
824 movq [r0+r1*1], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
825 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
826 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
827 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
828 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
829
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
830 cglobal put_vp8_pixels16_mmx, 5,5
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
831 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
832 movq mm0, [r2+r3*0+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
833 movq mm1, [r2+r3*0+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
834 movq mm2, [r2+r3*1+0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
835 movq mm3, [r2+r3*1+8]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
836 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
837 movq [r0+r1*0+0], mm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
838 movq [r0+r1*0+8], mm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
839 movq [r0+r1*1+0], mm2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
840 movq [r0+r1*1+8], mm3
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
841 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
842 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
843 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
844 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
845
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
846 cglobal put_vp8_pixels16_sse, 5,5,2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
847 .nextrow:
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
848 movups xmm0, [r2+r3*0]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
849 movups xmm1, [r2+r3*1]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
850 lea r2, [r2+r3*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
851 movaps [r0+r1*0], xmm0
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
852 movaps [r0+r1*1], xmm1
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
853 lea r0, [r0+r1*2]
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
854 sub r4d, 2
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
855 jg .nextrow
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
856 REP_RET
da388061b227 Add x86 asm functions for VP8 put_pixels
darkshikari
parents: 11991
diff changeset
857
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
858 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
859 ; IDCT functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
860 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
861 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
862 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
863
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
864 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
865 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
866 movd mm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
867
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
868 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
869 paddw mm0, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
870 pxor mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
871 psraw mm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
872 psubw mm1, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
873 packuswb mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
874 packuswb mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
875 punpcklbw mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
876 punpcklbw mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
877 punpcklwd mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
878 punpcklwd mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
879
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
880 ; add DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
881 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
882 movd mm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
883 movd mm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
884 movd mm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
885 movd mm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
886 paddusb mm2, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
887 paddusb mm3, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
888 paddusb mm4, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
889 paddusb mm5, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
890 psubusb mm2, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
891 psubusb mm3, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
892 psubusb mm4, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
893 psubusb mm5, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
894 movd [r0], mm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
895 movd [r0+r2], mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
896 movd [r1], mm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
897 movd [r1+r2], mm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
898 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
899
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
900 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
901 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
902 movd xmm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
903 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
904 pxor xmm1, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
905 movq xmm2, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
906
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
907 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
908 paddw xmm0, xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
909 movd xmm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
910 movd xmm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
911 movd xmm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
912 movd xmm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
913 psraw xmm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
914 pshuflw xmm0, xmm0, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
915 punpcklqdq xmm0, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
916 punpckldq xmm2, xmm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
917 punpckldq xmm4, xmm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
918 punpcklbw xmm2, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
919 punpcklbw xmm4, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
920 paddw xmm2, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
921 paddw xmm4, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
922 packuswb xmm2, xmm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
923 movd [r0], xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
924 pextrd [r0+r2], xmm2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
925 pextrd [r1], xmm2, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
926 pextrd [r1+r2], xmm2, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
927 RET
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
928
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
929 ;-----------------------------------------------------------------------------
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
930 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
931 ;-----------------------------------------------------------------------------
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
932
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
933 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
934 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
935 %macro VP8_MULTIPLY_SUMSUB 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
936 mova %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
937 mova %4, %2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
938 pmulhw %3, m6 ;20091(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
939 pmulhw %4, m6 ;20091(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
940 paddw %3, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
941 paddw %4, %2
12018
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
942 paddw %1, %1
1b11083f4bb4 Use add instead of lshift in mmxext vp8 idct
darkshikari
parents: 12017
diff changeset
943 paddw %2, %2
12013
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
944 pmulhw %1, m7 ;35468(1)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
945 pmulhw %2, m7 ;35468(2)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
946 psubw %1, %4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
947 paddw %2, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
948 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
949
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
950 ; calculate x0=%1+%3; x1=%1-%3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
951 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
952 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
953 ; %5/%6 are temporary registers
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
954 ; we assume m6/m7 have constant words 20091/17734 loaded in them
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
955 %macro VP8_IDCT_TRANSFORM4x4_1D 6
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
956 SUMSUB_BA m%3, m%1, m%5 ;t0, t1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
957 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
958 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
959 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
960 SWAP %4, %1
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
961 SWAP %4, %3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
962 %endmacro
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
963
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
964 INIT_MMX
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
965 cglobal vp8_idct_add_mmx, 3, 3
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
966 ; load block data
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
967 movq m0, [r1]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
968 movq m1, [r1+8]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
969 movq m2, [r1+16]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
970 movq m3, [r1+24]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
971 movq m6, [pw_20091]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
972 movq m7, [pw_17734]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
973
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
974 ; actual IDCT
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
975 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
976 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
977 paddw m0, [pw_4]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
978 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
979 TRANSPOSE4x4W 0, 1, 2, 3, 4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
980
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
981 ; store
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
982 pxor m4, m4
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
983 lea r1, [r0+2*r2]
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
984 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
985 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
986
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
987 RET
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
988
2ae70e2c31a4 MMX idct_add for VP8.
rbultje
parents: 12006
diff changeset
989 ;-----------------------------------------------------------------------------
12006
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
990 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
991 ;-----------------------------------------------------------------------------
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
992
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
993 %macro SCATTER_WHT 1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
994 pextrw r1d, m0, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
995 pextrw r2d, m1, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
996 mov [r0+2*16*0], r1w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
997 mov [r0+2*16*1], r2w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
998 pextrw r1d, m2, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
999 pextrw r2d, m3, %1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1000 mov [r0+2*16*2], r1w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1001 mov [r0+2*16*3], r2w
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1002 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1003
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1004 %macro HADAMARD4_1D 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1005 SUMSUB_BADC m%2, m%1, m%4, m%3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1006 SUMSUB_BADC m%4, m%2, m%3, m%1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1007 SWAP %1, %4, %3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1008 %endmacro
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1009
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1010 INIT_MMX
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1011 cglobal vp8_luma_dc_wht_mmxext, 2,3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1012 movq m0, [r1]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1013 movq m1, [r1+8]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1014 movq m2, [r1+16]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1015 movq m3, [r1+24]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1016 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1017 TRANSPOSE4x4W 0, 1, 2, 3, 4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1018 paddw m0, [pw_3]
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1019 HADAMARD4_1D 0, 1, 2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1020 psraw m0, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1021 psraw m1, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1022 psraw m2, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1023 psraw m3, 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1024 SCATTER_WHT 0
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1025 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1026 SCATTER_WHT 1
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1027 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1028 SCATTER_WHT 2
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1029 add r0, 2*16*4
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1030 SCATTER_WHT 3
d584c7373a64 Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents: 12000
diff changeset
1031 RET