annotate x86/vp8dsp.asm @ 11991:a6d24fc1deb7 libavcodec

Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
author darkshikari
date Mon, 28 Jun 2010 18:56:24 +0000
parents c3afb5be0d9b
children da388061b227
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
24
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27 fourtap_filter_hw_m: times 4 dw -6, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36 sixtap_filter_hw_m: times 4 dw 2, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46 fourtap_filter_hb_m: times 8 db -6, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
47 times 8 db 123, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
48 times 8 db -9, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
49 times 8 db 93, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
50 times 8 db -6, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
51 times 8 db 50, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
52 times 8 db -1, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
53 times 8 db 12, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
54
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55 sixtap_filter_hb_m: times 8 db 2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65 fourtap_filter_v_m: times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82 sixtap_filter_v_m: times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
101 bilinear_filter_vw_m: times 8 dw 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
102 times 8 dw 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
103 times 8 dw 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
104 times 8 dw 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
105 times 8 dw 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
106 times 8 dw 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
107 times 8 dw 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
108
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
109 bilinear_filter_vb_m: times 8 db 7, 1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
110 times 8 db 6, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
111 times 8 db 5, 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
112 times 8 db 4, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
113 times 8 db 3, 5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
114 times 8 db 2, 6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
115 times 8 db 1, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
116
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
117 %ifdef PIC
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
118 %define fourtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
119 %define sixtap_filter_hw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
120 %define fourtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
121 %define sixtap_filter_hb r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
122 %define fourtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
123 %define sixtap_filter_v r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
124 %define bilinear_filter_vw r11
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
125 %define bilinear_filter_vb r11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
126 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 %define sixtap_filter_v sixtap_filter_v_m
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
133 %define bilinear_filter_vw bilinear_filter_vw_m
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
134 %define bilinear_filter_vb bilinear_filter_vb_m
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
135 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
137 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
138 filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
139
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
140 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
141 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
142 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
143
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144 cextern pw_4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
145 cextern pw_64
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
146
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
147 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
148
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
149 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
150 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
151 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
152 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
153 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
154 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
155 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
156
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
157 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
158 cglobal put_vp8_epel4_h4_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
159 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
160 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
162 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
163 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
164 movq mm5, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166 pxor mm6, mm6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
167
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
168 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
169 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
170
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
171 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
172 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
173 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
174 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
175 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
176 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
177 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
178 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
179 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
180 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
181 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
182
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
183 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
184 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
185 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
186 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
187 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
188 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
189
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
190 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
191 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
192 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
193 psraw mm3, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
194 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
195 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
196
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
197 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
198 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
199 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
200 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
201 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
202 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
203
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
204 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
205 cglobal put_vp8_epel4_h6_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
206 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
207 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
208 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
209 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
210 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
211 movq mm5, [sixtap_filter_hw+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
212 movq mm6, [sixtap_filter_hw+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
213 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
214 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
215
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
216 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
217 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
218
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
219 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
220 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
221 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
222 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
223 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
224 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
225 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
226 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
227 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
228 pshufw mm3, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
229 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
230 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
231 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
232 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
233 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
234 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
235
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
236 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
237 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
238 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
239 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
240 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
241 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
242 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
243 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
244 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
245 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
246
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
247 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
248 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
249 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
250 psraw mm1, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
251 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
252 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
253
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
254 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
255 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
256 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
257 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
258 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
259 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
260
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
261 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
262 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
263 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
264 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
265 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
266 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
267 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
268 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
269 mova m6, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
270 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
271
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
272 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
273 movh m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
274 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
275 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
276 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
277 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
278 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
279 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
280 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
281 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
282 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
283 pmaddwd m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
284 pmaddwd m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
285 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
286
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
287 movh m1, [r2+3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
288 punpcklbw m1, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
289 mova m2, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
290 mova m3, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
291 mova m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
292 psrldq m2, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
293 psrldq m3, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
294 psrldq m4, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
295 punpcklwd m1, m2 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
296 punpcklwd m3, m4 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
297 pmaddwd m1, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
298 pmaddwd m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
299 paddd m1, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
300
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
301 packssdw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
302 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
303 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
304 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
305 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
306
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
307 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
308 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
309 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
310 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
311 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
312 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
313
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
314 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
315 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
316 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
317 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
318 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
319 lea r5, [sixtap_filter_hw+r5*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
320 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
321
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
322 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
323 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
324 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
325 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
326 punpcklbw m0, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
327 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
328 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
329 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
330 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
331 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
332 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
333 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
334 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
335 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
336 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
337 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
338 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
339 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
340 pmaddwd m0, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 paddd m0, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 psrldq m6, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347 mova m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 punpcklbw m6, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 mova m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350 mova m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351 mova m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 punpcklwd m6, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362 pmaddwd m6, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 paddd m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366 paddd m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 packssdw m0, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383 mova m2, [pw_64]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
384 mova m3, [filter_h4_shuf]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
385 mova m4, [filter_h6_shuf2]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
389 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
390 mova m6, [fourtap_filter_hb+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393 movu m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396 pshufb m1, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 lea r5d, [r5*3]
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
414 mova m3, [filter_h6_shuf1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
415 mova m4, [filter_h6_shuf2]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
419 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
420 mova m6, [sixtap_filter_hb+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421 mova m7, [sixtap_filter_hb+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428 pshufb m1, m4
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
429 pshufb m2, [filter_h6_shuf3]
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 pmaddubsw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
434 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
441 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
446
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
447 %macro FILTER_V 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448 ; 4x4 block, V-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
449 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 shl r6d, 5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
451 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 lea r11, [fourtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
453 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
454 lea r6, [fourtap_filter_v+r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
455 mova m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
456 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
457 mova m5, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
458
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
459 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
461 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
462 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
463 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
464 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
465 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
466 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
467 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
468
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
469 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
470 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
471 movh m4, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
472 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
473 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
474 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
475 pmullw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
476 paddsw m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
477
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
478 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
479 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
480 pmullw m1, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
481 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 paddsw m4, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488 paddsw m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
489 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490 packuswb m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
493 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
495 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
497 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
499
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
501 ; 4x4 block, V-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
502 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
503 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
504 lea r6, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
505 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
506 lea r11, [sixtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
507 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
508 lea r6, [sixtap_filter_v+r6-96]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
509 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
511 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
512 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
513 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
514 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
515 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
516 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
517 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
518 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
519 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
520 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
521 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
522 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
523 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
524 punpcklbw m3, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
525 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
526
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
527 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
528 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
529 mova m5, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
530 pmullw m5, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
531 mova m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
532 pmullw m6, [r6+64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
533 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
534
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
535 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
536 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
537 punpcklbw m5, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
538 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
539 paddsw m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
540 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
541 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
542 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
543 paddsw m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545 pmullw m3, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 paddsw m6, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 pmullw m5, [r6+80]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
551
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
555 packuswb m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
556 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
557
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
558 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
559 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
560 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
561 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
562 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
563 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
564 %endmacro
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
565
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
566 INIT_MMX
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
567 FILTER_V mmxext, 4, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
568 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
569 FILTER_V sse2, 8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
570
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
571 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
572 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
573 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
574 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
575 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
576 mova m5, [fourtap_filter_hb+r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
577 mova m6, [fourtap_filter_hb+r6]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
578 mova m7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
579
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
580 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
581 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
582 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
583 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
584 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
585 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
586
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
587 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
588 movh m3, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
589 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
590 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
591 punpcklbw m4, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
592 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
593 pmaddubsw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
594 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
595 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
596 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
597 paddsw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
598 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
599 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
600 packuswb m4, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
601 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
602
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
603 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
604 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
605 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
606 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
607 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
608 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
609
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
610 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
611 lea r6d, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
612 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
613 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
614 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
615 lea r6, [sixtap_filter_hb+r6*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
616
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
617 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
618 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
619 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
620 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
621 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
622 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
623 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
624 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
625 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
626 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
627
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
628 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
629 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
630 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
631 punpcklbw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
632 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
633 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
634 mova m7, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
635 punpcklbw m7, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
636 pmaddubsw m6, [r6-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
637 pmaddubsw m1, [r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
638 pmaddubsw m7, [r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
639 paddsw m6, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
640 paddsw m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
641 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
642 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
643 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
644 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
645 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
646 packuswb m6, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
647 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
648 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
649
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
650 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
651 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
652 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
653 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
654 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
655 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
656
11991
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
657 %macro FILTER_BILINEAR 3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
658 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
659 mov r5d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
660 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
661 sub r5d, r6d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
662 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
663 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
664 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
665 pxor m6, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
666 mova m4, [bilinear_filter_vw+r5d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
667 mova m5, [bilinear_filter_vw+r6d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
668 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
669 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
670 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
671 movh m3, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
672 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
673 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
674 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
675 mova m2, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
676 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
677 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
678 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
679 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
680 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
681 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
682 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
683 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
684 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
685 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
686 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
687 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
688 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
689 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
690 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
691 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
692 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
693 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
694 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
695 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
696
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
697 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
698 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
699 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
700 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
701 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
702
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
703 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
704 mov r6d, 8*16
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
705 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
706 sub r6d, r5d
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
707 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
708 lea r11, [bilinear_filter_vw_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
709 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
710 pxor m6, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
711 mova m4, [bilinear_filter_vw+r6d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
712 mova m5, [bilinear_filter_vw+r5d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
713 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
714 movh m0, [r2+r3*0+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
715 movh m1, [r2+r3*0+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
716 movh m2, [r2+r3*1+0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
717 movh m3, [r2+r3*1+1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
718 punpcklbw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
719 punpcklbw m1, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
720 punpcklbw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
721 punpcklbw m3, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
722 pmullw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
723 pmullw m1, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
724 pmullw m2, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
725 pmullw m3, m5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
726 paddsw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
727 paddsw m2, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
728 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
729 psraw m2, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
730 pavgw m0, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
731 pavgw m2, m6
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
732 %ifidn %1, mmxext
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
733 packuswb m0, m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
734 packuswb m2, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
735 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
736 movh [r0+r1*1], m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
737 %else
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
738 packuswb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
739 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
740 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
741 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
742
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
743 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
744 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
745 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
746 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
747 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
748 %endmacro
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
749
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
750 INIT_MMX
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
751 FILTER_BILINEAR mmxext, 4, 0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
752 INIT_XMM
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
753 FILTER_BILINEAR sse2, 8, 7
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
754
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
755 cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
756 shl r6d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
757 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
758 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
759 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
760 pxor m4, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
761 mova m3, [bilinear_filter_vb+r6d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
762 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
763 movh m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
764 movh m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
765 movh m2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
766 punpcklbw m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
767 punpcklbw m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
768 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
769 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
770 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
771 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
772 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
773 pavgw m1, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
774 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
775 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
776 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
777
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
778 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
779 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
780 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
781 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
782 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
783
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
784 cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
785 shl r5d, 4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
786 %ifdef PIC
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
787 lea r11, [bilinear_filter_vb_m]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
788 %endif
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
789 pxor m4, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
790 mova m2, [filter_h2_shuf]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
791 mova m3, [bilinear_filter_vb+r5d-16]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
792 .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
793 movu m0, [r2+r3*0]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
794 movu m1, [r2+r3*1]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
795 pshufb m0, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
796 pshufb m1, m2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
797 pmaddubsw m0, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
798 pmaddubsw m1, m3
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
799 psraw m0, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
800 psraw m1, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
801 pavgw m0, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
802 pavgw m1, m4
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
803 packuswb m0, m1
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
804 movh [r0+r1*0], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
805 movhps [r0+r1*1], m0
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
806
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
807 lea r0, [r0+r1*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
808 lea r2, [r2+r3*2]
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
809 sub r4, 2
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
810 jg .nextrow
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
811 REP_RET
a6d24fc1deb7 Add MMX, SSE2, SSSE3 asm for VP8 bilinear MC
darkshikari
parents: 11975
diff changeset
812
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
813 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
814 ; IDCT functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
815 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
816 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
817 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
818
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
819 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
820 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
821 movd mm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
822
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
823 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
824 paddw mm0, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
825 pxor mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
826 psraw mm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
827 psubw mm1, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
828 packuswb mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
829 packuswb mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
830 punpcklbw mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
831 punpcklbw mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
832 punpcklwd mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
833 punpcklwd mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
834
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
835 ; add DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
836 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
837 movd mm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
838 movd mm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
839 movd mm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
840 movd mm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
841 paddusb mm2, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
842 paddusb mm3, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
843 paddusb mm4, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
844 paddusb mm5, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
845 psubusb mm2, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
846 psubusb mm3, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
847 psubusb mm4, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
848 psubusb mm5, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
849 movd [r0], mm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
850 movd [r0+r2], mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
851 movd [r1], mm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
852 movd [r1+r2], mm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
853 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
854
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
855 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
856 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
857 movd xmm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
858 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
859 pxor xmm1, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
860 movq xmm2, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
861
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
862 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
863 paddw xmm0, xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
864 movd xmm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
865 movd xmm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
866 movd xmm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
867 movd xmm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
868 psraw xmm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
869 pshuflw xmm0, xmm0, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
870 punpcklqdq xmm0, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
871 punpckldq xmm2, xmm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
872 punpckldq xmm4, xmm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
873 punpcklbw xmm2, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
874 punpcklbw xmm4, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
875 paddw xmm2, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
876 paddw xmm4, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
877 packuswb xmm2, xmm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
878 movd [r0], xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
879 pextrd [r0+r2], xmm2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
880 pextrd [r1], xmm2, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
881 pextrd [r1+r2], xmm2, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
882 RET