annotate x86/vp8dsp.asm @ 11975:c3afb5be0d9b libavcodec

First shot at VP8 optimizations: - MMXEXT, SSE2 and SSSE3 MC functions - MMX and SSE4 IDCT dc_add functions Patch by Jason Garrett-Glaser <darkshikari gmail com> and myself.
author rbultje
date Sun, 27 Jun 2010 02:01:45 +0000
parents
children a6d24fc1deb7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11975
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
1 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
2 ;* VP8 MMXEXT optimizations
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
5 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
6 ;* This file is part of FFmpeg.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
7 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
8 ;* FFmpeg is free software; you can redistribute it and/or
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
9 ;* modify it under the terms of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
10 ;* License as published by the Free Software Foundation; either
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
11 ;* version 2.1 of the License, or (at your option) any later version.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
12 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
13 ;* FFmpeg is distributed in the hope that it will be useful,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
16 ;* Lesser General Public License for more details.
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
17 ;*
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
18 ;* You should have received a copy of the GNU Lesser General Public
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
19 ;* License along with FFmpeg; if not, write to the Free Software
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
20 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
21 ;******************************************************************************
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
22
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
23 %include "x86inc.asm"
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
24
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
25 SECTION_RODATA
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
26
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
27 fourtap_filter_hw_m: times 4 dw -6, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
28 times 4 dw 12, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
29 times 4 dw -9, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
30 times 4 dw 50, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
31 times 4 dw -6, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
32 times 4 dw 93, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
33 times 4 dw -1, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
34 times 4 dw 123, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
35
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
36 sixtap_filter_hw_m: times 4 dw 2, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
37 times 4 dw 108, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
38 times 4 dw -8, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
39 times 4 dw 3, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
40 times 4 dw 77, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
41 times 4 dw -16, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
42 times 4 dw 1, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
43 times 4 dw 36, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
44 times 4 dw -11, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
45
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
46 fourtap_filter_hb_m: times 8 db -6, -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
47 times 8 db 123, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
48 times 8 db -9, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
49 times 8 db 93, 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
50 times 8 db -6, -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
51 times 8 db 50, 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
52 times 8 db -1, -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
53 times 8 db 12, 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
54
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
55 sixtap_filter_hb_m: times 8 db 2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
56 times 8 db -11, 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
57 times 8 db 36, -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
58 times 8 db 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
59 times 8 db -16, 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
60 times 8 db 77, -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
61 times 8 db 1, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
62 times 8 db -8, 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
63 times 8 db 108, -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
64
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
65 fourtap_filter_v_m: times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
66 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
67 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
68 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
69 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
70 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
71 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
72 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
73 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
74 times 8 dw 50
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
75 times 8 dw 93
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
76 times 8 dw -9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
77 times 8 dw -1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
78 times 8 dw 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
79 times 8 dw 123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
80 times 8 dw -6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
81
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
82 sixtap_filter_v_m: times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
83 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
84 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
85 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
86 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
87 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
88 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
89 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
90 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
91 times 8 dw 77
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
92 times 8 dw -16
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
93 times 8 dw 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
94 times 8 dw 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
95 times 8 dw -8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
96 times 8 dw 36
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
97 times 8 dw 108
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
98 times 8 dw -11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
99 times 8 dw 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
100
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
101 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
102 %define fourtap_filter_hw r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
103 %define sixtap_filter_hw r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
104 %define fourtap_filter_hb r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
105 %define sixtap_filter_hb r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
106 %define fourtap_filter_v r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
107 %define sixtap_filter_v r11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
108 %else
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
109 %define fourtap_filter_hw fourtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
110 %define sixtap_filter_hw sixtap_filter_hw_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
111 %define fourtap_filter_hb fourtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
112 %define sixtap_filter_hb sixtap_filter_hb_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
113 %define fourtap_filter_v fourtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
114 %define sixtap_filter_v sixtap_filter_v_m
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
115 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
116
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
117 filter_v4_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
118 filter_v4_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
119
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
120 filter_v6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
121 filter_v6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
122 filter_v6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
123
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
124 cextern pw_4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
125 cextern pw_64
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
126
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
127 SECTION .text
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
128
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
129 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
130 ; subpel MC functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
131 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
132 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
133 ; uint8_t *src, int srcstride,
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
134 ; int height, int mx, int my);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
135 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
136
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
137 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
138 cglobal put_vp8_epel4_h4_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
139 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
140 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
141 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
142 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
143 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
144 movq mm5, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
145 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
146 pxor mm6, mm6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
147
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
148 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
149 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
150
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
151 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
152 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
153 punpcklbw mm1, mm6 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
154 pshufw mm0, mm2, 9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
155 punpcklbw mm0, mm6 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
156 pshufw mm3, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
157 pshufw mm1, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
158 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
159 movq mm0, mm1 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
160 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
161 paddd mm3, mm1 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
162
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
163 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
164 punpckhbw mm2, mm6 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
165 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
166 pshufw mm1, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
167 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
168 paddd mm0, mm1 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
169
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
170 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
171 packssdw mm3, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
172 paddsw mm3, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
173 psraw mm3, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
174 packuswb mm3, mm6 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
175 movd [r0], mm3 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
176
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
177 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
178 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
179 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
180 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
181 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
182 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
183
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
184 ; 4x4 block, H-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
185 cglobal put_vp8_epel4_h6_mmxext, 6, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
186 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
187 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
188 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
189 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
190 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
191 movq mm5, [sixtap_filter_hw+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
192 movq mm6, [sixtap_filter_hw+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
193 movq mm7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
194 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
195
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
196 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
197 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
198
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
199 ; first set of 2 pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
200 movq mm2, mm1 ; byte ABCD..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
201 punpcklbw mm1, mm3 ; byte->word ABCD
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
202 pshufw mm0, mm2, 0x9 ; byte CDEF..
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
203 punpckhbw mm2, mm3 ; byte->word EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
204 punpcklbw mm0, mm3 ; byte->word CDEF
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
205 pshufw mm1, mm1, 0x94 ; word ABBC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
206 pshufw mm2, mm2, 0x94 ; word EFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
207 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
208 pshufw mm3, mm0, 0x94 ; word CDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
209 movq mm0, mm3 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
210 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
211 paddd mm1, mm3 ; add to 1st 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
212 movq mm3, mm2 ; backup for second set of pixels
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
213 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
214 paddd mm1, mm2 ; finish 1st 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
215
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
216 ; second set of 2 pixels, use backup of above
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
217 movd mm2, [r2+3] ; byte FGHI (prevent overreads)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
218 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
219 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
220 paddd mm0, mm3 ; add to 2nd 2px cache
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
221 pxor mm3, mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
222 punpcklbw mm2, mm3 ; byte->word FGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
223 pshufw mm2, mm2, 0xE9 ; word GHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
224 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
225 paddd mm0, mm2 ; finish 2nd 2px
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
226
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
227 ; merge two sets of 2 pixels into one set of 4, round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
228 packssdw mm1, mm0 ; merge dword->word (4px)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
229 paddsw mm1, mm7 ; rounding
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
230 psraw mm1, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
231 packuswb mm1, mm3 ; clip and word->bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
232 movd [r0], mm1 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
233
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
234 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
235 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
236 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
237 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
238 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
239 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
240
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
241 ; 4x4 block, H-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
242 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
243 cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
244 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
245 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
246 lea r11, [fourtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
247 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
248 mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
249 mova m6, [fourtap_filter_hw+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
250 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
251
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
252 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
253 movh m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
254 punpcklbw m0, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
255 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
256 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
257 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
258 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
259 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
260 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
261 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
262 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
263 pmaddwd m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
264 pmaddwd m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
265 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
266
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
267 movh m1, [r2+3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
268 punpcklbw m1, m7 ; ABCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
269 mova m2, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
270 mova m3, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
271 mova m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
272 psrldq m2, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
273 psrldq m3, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
274 psrldq m4, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
275 punpcklwd m1, m2 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
276 punpcklwd m3, m4 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
277 pmaddwd m1, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
278 pmaddwd m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
279 paddd m1, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
280
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
281 packssdw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
282 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
283 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
284 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
285 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
286
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
287 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
288 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
289 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
290 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
291 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
292 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
293
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
294 cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
295 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
296 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
297 lea r11, [sixtap_filter_hw_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
298 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
299 lea r5, [sixtap_filter_hw+r5*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
300 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
301
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
302 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
303 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
304 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
305 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
306 punpcklbw m0, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
307 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
308 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
309 mova m3, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
310 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
311 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
312 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
313 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
314 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
315 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
316 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
317 punpcklwd m0, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
318 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
319 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
320 pmaddwd m0, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
321 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
322 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
323 paddd m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
324 paddd m0, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
325
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
326 psrldq m6, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
327 mova m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
328 punpcklbw m6, m7 ; ABCDEFGHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
329 mova m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
330 mova m2, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
331 mova m3, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
332 psrldq m1, 2 ; BCDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
333 psrldq m2, 4 ; CDEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
334 psrldq m3, 6 ; DEFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
335 psrldq m4, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
336 punpcklbw m4, m7 ; EFGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
337 mova m5, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
338 psrldq m5, 2 ; FGH
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
339 punpcklwd m6, m1 ; ABBCCDDE
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
340 punpcklwd m2, m3 ; CDDEEFFG
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
341 punpcklwd m4, m5 ; EFFGGHHI
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
342 pmaddwd m6, [r5-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
343 pmaddwd m2, [r5-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
344 pmaddwd m4, [r5-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
345 paddd m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
346 paddd m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
347
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
348 packssdw m0, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
349 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
350 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
351 packuswb m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
352 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
353
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
354 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
355 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
356 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
357 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
358 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
359 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
360
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
361 cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
362 shl r5d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
363 mova m2, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
364 mova m3, [filter_v4_shuf1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
365 mova m4, [filter_v4_shuf2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
366 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
367 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
368 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
369 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
370 mova m6, [fourtap_filter_hb+r5]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
371
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
372 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
373 movu m0, [r2-1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
374 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
375 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
376 pshufb m1, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
377 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
378 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
379 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
380 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
381 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
382 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
383 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
384
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
385 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
386 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
387 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
388 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
389 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
390 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
391
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
392 cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
393 lea r5d, [r5*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
394 mova m3, [filter_v6_shuf1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
395 mova m4, [filter_v6_shuf2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
396 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
397 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
398 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
399 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
400 mova m6, [sixtap_filter_hb+r5*8-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
401 mova m7, [sixtap_filter_hb+r5*8-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
402
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
403 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
404 movu m0, [r2-2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
405 mova m1, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
406 mova m2, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
407 pshufb m0, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
408 pshufb m1, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
409 pshufb m2, [filter_v6_shuf3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
410 pmaddubsw m0, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
411 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
412 pmaddubsw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
413 paddsw m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
414 paddsw m0, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
415 paddsw m0, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
416 psraw m0, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
417 packuswb m0, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
418 movh [r0], m0 ; store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
419
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
420 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
421 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
422 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
423 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
424 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
425 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
426
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
427 %macro FILTER_V 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
428 ; 4x4 block, V-only 4-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
429 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
430 shl r6d, 5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
431 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
432 lea r11, [fourtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
433 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
434 lea r6, [fourtap_filter_v+r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
435 mova m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
436 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
437 mova m5, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
438
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
439 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
440 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
441 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
442 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
443 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
444 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
445 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
446 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
447 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
448
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
449 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
450 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
451 movh m4, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
452 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
453 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
454 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
455 pmullw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
456 paddsw m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
457
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
458 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
459 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
460 pmullw m1, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
461 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
462 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
463 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
464 paddsw m4, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
465 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
466
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
467 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
468 paddsw m4, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
469 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
470 packuswb m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
471 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
472
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
473 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
474 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
475 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
476 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
477 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
478 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
479
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
480
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
481 ; 4x4 block, V-only 6-tap filter
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
482 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
483 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
484 lea r6, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
485 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
486 lea r11, [sixtap_filter_v_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
487 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
488 lea r6, [sixtap_filter_v+r6-96]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
489 pxor m7, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
490
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
491 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
492 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
493 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
494 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
495 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
496 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
497 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
498 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
499 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
500 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
501 punpcklbw m0, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
502 punpcklbw m1, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
503 punpcklbw m2, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
504 punpcklbw m3, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
505 punpcklbw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
506
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
507 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
508 ; first calculate negative taps (to prevent losing positive overflows)
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
509 mova m5, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
510 pmullw m5, [r6+16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
511 mova m6, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
512 pmullw m6, [r6+64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
513 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
514
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
515 ; then calculate positive taps
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
516 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
517 punpcklbw m5, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
518 pmullw m0, [r6+0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
519 paddsw m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
520 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
521 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
522 pmullw m2, [r6+32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
523 paddsw m6, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
524 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
525 pmullw m3, [r6+48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
526 paddsw m6, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
527 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
528 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
529 pmullw m5, [r6+80]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
530 paddsw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
531
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
532 ; round/clip/store
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
533 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
534 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
535 packuswb m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
536 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
537
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
538 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
539 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
540 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
541 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
542 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
543 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
544 %endmacro
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
545
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
546 INIT_MMX
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
547 FILTER_V mmxext, 4, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
548 INIT_XMM
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
549 FILTER_V sse2, 8, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
550
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
551 cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
552 shl r6d, 4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
553 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
554 lea r11, [fourtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
555 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
556 mova m5, [fourtap_filter_hb+r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
557 mova m6, [fourtap_filter_hb+r6]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
558 mova m7, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
559
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
560 ; read 3 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
561 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
562 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
563 movh m1, [r2+ r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
564 movh m2, [r2+2*r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
565 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
566
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
567 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
568 movh m3, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
569 mova m4, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
570 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
571 punpcklbw m4, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
572 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
573 pmaddubsw m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
574 pmaddubsw m1, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
575 paddsw m4, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
576 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
577 paddsw m4, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
578 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
579 psraw m4, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
580 packuswb m4, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
581 movh [r0], m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
582
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
583 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
584 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
585 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
586 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
587 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
588 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
589
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
590 cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
591 lea r6d, [r6*3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
592 %ifdef PIC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
593 lea r11, [sixtap_filter_hb_m]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
594 %endif
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
595 lea r6, [sixtap_filter_hb+r6*8]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
596
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
597 ; read 5 lines
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
598 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
599 sub r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
600 movh m0, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
601 movh m1, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
602 movh m2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
603 lea r2, [r2+r3*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
604 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
605 movh m3, [r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
606 movh m4, [r2+r3]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
607
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
608 .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
609 movh m5, [r2+2*r3] ; read new row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
610 mova m6, m0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
611 punpcklbw m6, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
612 mova m0, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
613 punpcklbw m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
614 mova m7, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
615 punpcklbw m7, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
616 pmaddubsw m6, [r6-48]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
617 pmaddubsw m1, [r6-32]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
618 pmaddubsw m7, [r6-16]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
619 paddsw m6, m1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
620 paddsw m6, m7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
621 mova m1, m2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
622 paddsw m6, [pw_64]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
623 mova m2, m3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
624 psraw m6, 7
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
625 mova m3, m4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
626 packuswb m6, m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
627 mova m4, m5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
628 movh [r0], m6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
629
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
630 ; go to next line
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
631 add r0, r1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
632 add r2, r3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
633 dec r4 ; next row
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
634 jg .nextrow
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
635 REP_RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
636
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
637 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
638 ; IDCT functions:
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
639 ;
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
640 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
641 ;-----------------------------------------------------------------------------
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
642
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
643 cglobal vp8_idct_dc_add_mmx, 3, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
644 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
645 movd mm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
646
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
647 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
648 paddw mm0, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
649 pxor mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
650 psraw mm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
651 psubw mm1, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
652 packuswb mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
653 packuswb mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
654 punpcklbw mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
655 punpcklbw mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
656 punpcklwd mm0, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
657 punpcklwd mm1, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
658
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
659 ; add DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
660 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
661 movd mm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
662 movd mm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
663 movd mm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
664 movd mm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
665 paddusb mm2, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
666 paddusb mm3, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
667 paddusb mm4, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
668 paddusb mm5, mm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
669 psubusb mm2, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
670 psubusb mm3, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
671 psubusb mm4, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
672 psubusb mm5, mm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
673 movd [r0], mm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
674 movd [r0+r2], mm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
675 movd [r1], mm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
676 movd [r1+r2], mm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
677 RET
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
678
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
679 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
680 ; load data
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
681 movd xmm0, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
682 lea r1, [r0+r2*2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
683 pxor xmm1, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
684 movq xmm2, [pw_4]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
685
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
686 ; calculate DC
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
687 paddw xmm0, xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
688 movd xmm2, [r0]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
689 movd xmm3, [r0+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
690 movd xmm4, [r1]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
691 movd xmm5, [r1+r2]
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
692 psraw xmm0, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
693 pshuflw xmm0, xmm0, 0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
694 punpcklqdq xmm0, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
695 punpckldq xmm2, xmm3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
696 punpckldq xmm4, xmm5
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
697 punpcklbw xmm2, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
698 punpcklbw xmm4, xmm1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
699 paddw xmm2, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
700 paddw xmm4, xmm0
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
701 packuswb xmm2, xmm4
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
702 movd [r0], xmm2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
703 pextrd [r0+r2], xmm2, 1
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
704 pextrd [r1], xmm2, 2
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
705 pextrd [r1+r2], xmm2, 3
c3afb5be0d9b First shot at VP8 optimizations:
rbultje
parents:
diff changeset
706 RET