annotate x86/dsputil_yasm.asm @ 11339:a82cdda1f507 libavcodec

Merge weight & offset tables, 15 cpu cycles faster.
author michael
date Wed, 03 Mar 2010 14:41:43 +0000
parents abb3b23bda35
children 980030a3e315
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
1 ;******************************************************************************
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
2 ;* MMX optimized DSP utils
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
3 ;* Copyright (c) 2008 Loren Merritt
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
4 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
5 ;* This file is part of FFmpeg.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
6 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
7 ;* FFmpeg is free software; you can redistribute it and/or
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
8 ;* modify it under the terms of the GNU Lesser General Public
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
9 ;* License as published by the Free Software Foundation; either
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
10 ;* version 2.1 of the License, or (at your option) any later version.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
11 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
12 ;* FFmpeg is distributed in the hope that it will be useful,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
15 ;* Lesser General Public License for more details.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
16 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
17 ;* You should have received a copy of the GNU Lesser General Public
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
18 ;* License along with FFmpeg; if not, write to the Free Software
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
20 ;******************************************************************************
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
21
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
22 %include "x86inc.asm"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
23
10430
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
24 SECTION_RODATA
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
25 pb_f: times 16 db 15
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
26 pb_zzzzzzzz77777777: times 8 db -1
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
27 pb_7: times 8 db 7
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
30
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
31 section .text align=16
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
32
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
33 %macro PSWAPD_SSE 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
34 pshufw %1, %2, 0x4e
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
35 %endmacro
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
36 %macro PSWAPD_3DN1 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
37 movq %1, %2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
38 psrlq %1, 32
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
39 punpckldq %1, %2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
40 %endmacro
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
41
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
42 %macro FLOAT_TO_INT16_INTERLEAVE6 1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
43 ; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
44 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
45 %ifdef ARCH_X86_64
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
46 %define lend r10d
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
47 mov lend, r2d
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
48 %else
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
49 %define lend dword r2m
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
50 %endif
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
51 mov src1q, [srcq+1*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
52 mov src2q, [srcq+2*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
53 mov src3q, [srcq+3*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
54 mov src4q, [srcq+4*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
55 mov src5q, [srcq+5*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
56 mov srcq, [srcq]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
57 sub src1q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
58 sub src2q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
59 sub src3q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
60 sub src4q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
61 sub src5q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
62 .loop:
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
63 cvtps2pi mm0, [srcq]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
64 cvtps2pi mm1, [srcq+src1q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
65 cvtps2pi mm2, [srcq+src2q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
66 cvtps2pi mm3, [srcq+src3q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
67 cvtps2pi mm4, [srcq+src4q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
68 cvtps2pi mm5, [srcq+src5q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
69 packssdw mm0, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
70 packssdw mm1, mm4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
71 packssdw mm2, mm5
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
72 pswapd mm3, mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
73 punpcklwd mm0, mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
74 punpckhwd mm1, mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
75 punpcklwd mm2, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
76 pswapd mm3, mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
77 punpckldq mm0, mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
78 punpckhdq mm2, mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
79 punpckldq mm1, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
80 movq [dstq ], mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
81 movq [dstq+16], mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
82 movq [dstq+ 8], mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
83 add srcq, 8
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
84 add dstq, 24
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
85 sub lend, 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
86 jg .loop
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
87 emms
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
88 RET
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
89 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
90
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
91 %define pswapd PSWAPD_SSE
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
92 FLOAT_TO_INT16_INTERLEAVE6 sse
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
93 %define cvtps2pi pf2id
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
94 %define pswapd PSWAPD_3DN1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
95 FLOAT_TO_INT16_INTERLEAVE6 3dnow
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
96 %undef pswapd
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
97 FLOAT_TO_INT16_INTERLEAVE6 3dn2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
98 %undef cvtps2pi
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
99
8760
31138c296ac6 ff_add_hfyu_median_prediction_mmx2
lorenm
parents: 8430
diff changeset
100
31138c296ac6 ff_add_hfyu_median_prediction_mmx2
lorenm
parents: 8430
diff changeset
101
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
102 %macro SCALARPRODUCT 1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
103 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
104 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
105 shl orderq, 1
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
106 add v1q, orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
107 add v2q, orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
108 neg orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
109 movd m3, shiftm
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
110 pxor m2, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
111 .loop:
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
112 movu m0, [v1q + orderq]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
113 movu m1, [v1q + orderq + mmsize]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
114 pmaddwd m0, [v2q + orderq]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
115 pmaddwd m1, [v2q + orderq + mmsize]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
116 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
117 paddd m2, m1
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
118 add orderq, mmsize*2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
119 jl .loop
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
120 %if mmsize == 16
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
121 movhlps m0, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
122 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
123 psrad m2, m3
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
124 pshuflw m0, m2, 0x4e
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
125 %else
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
126 psrad m2, m3
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
127 pshufw m0, m2, 0x4e
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
128 %endif
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
129 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
130 movd eax, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
131 RET
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
132
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
133 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
10660
f0f34732208a fix a crash in ape decoding on x86_32 sse2
lorenm
parents: 10646
diff changeset
134 cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
135 shl orderq, 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
136 movd m7, mulm
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
137 %if mmsize == 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
138 pshuflw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
139 punpcklqdq m7, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
140 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
141 pshufw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
142 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
143 pxor m6, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
144 add v1q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
145 add v2q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
146 add v3q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
147 neg orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
148 .loop:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
149 movu m0, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
150 movu m1, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
151 mova m4, [v1q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
152 mova m5, [v1q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
153 movu m2, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
154 movu m3, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
155 pmaddwd m0, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
156 pmaddwd m1, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
157 pmullw m2, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
158 pmullw m3, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
159 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
160 paddd m6, m1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
161 paddw m2, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
162 paddw m3, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
163 mova [v1q + orderq], m2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
164 mova [v1q + orderq + mmsize], m3
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
165 add orderq, mmsize*2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
166 jl .loop
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
167 %if mmsize == 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
168 movhlps m0, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
169 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
170 pshuflw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
171 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
172 pshufw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
173 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
174 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
175 movd eax, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
176 RET
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
177 %endmacro
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
178
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
179 INIT_MMX
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
180 SCALARPRODUCT mmx2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
181 INIT_XMM
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
182 SCALARPRODUCT sse2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
183
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
184 %macro SCALARPRODUCT_LOOP 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
185 align 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
186 .loop%1:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
187 sub orderq, mmsize*2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
188 %if %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
189 mova m1, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
190 mova m4, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
191 mova m0, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
192 palignr m1, m0, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
193 palignr m0, m4, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
194 mova m3, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
195 mova m5, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
196 mova m2, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
197 palignr m3, m2, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
198 palignr m2, m5, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
199 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
200 mova m0, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
201 mova m1, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
202 mova m2, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
203 mova m3, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
204 %endif
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
205 %define t0 [v1q + orderq]
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
206 %define t1 [v1q + orderq + mmsize]
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
207 %ifdef ARCH_X86_64
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
208 mova m8, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
209 mova m9, t1
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
210 %define t0 m8
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
211 %define t1 m9
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
212 %endif
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
213 pmaddwd m0, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
214 pmaddwd m1, t1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
215 pmullw m2, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
216 pmullw m3, m7
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
217 paddw m2, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
218 paddw m3, t1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
219 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
220 paddd m6, m1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
221 mova [v1q + orderq], m2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
222 mova [v1q + orderq + mmsize], m3
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
223 jg .loop%1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
224 %if %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
225 jmp .end
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
226 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
227 %endmacro
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
228
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
229 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
230 cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
231 shl orderq, 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
232 movd m7, mulm
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
233 pshuflw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
234 punpcklqdq m7, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
235 pxor m6, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
236 mov r4d, v2d
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
237 and r4d, 15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
238 and v2q, ~15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
239 and v3q, ~15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
240 mova m4, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
241 mova m5, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
242 ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
243 cmp r4d, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
244 je .loop0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
245 cmp r4d, 2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
246 je .loop2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
247 cmp r4d, 4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
248 je .loop4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
249 cmp r4d, 6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
250 je .loop6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
251 cmp r4d, 8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
252 je .loop8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
253 cmp r4d, 10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
254 je .loop10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
255 cmp r4d, 12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
256 je .loop12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
257 SCALARPRODUCT_LOOP 14
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
258 SCALARPRODUCT_LOOP 12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
259 SCALARPRODUCT_LOOP 10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
260 SCALARPRODUCT_LOOP 8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
261 SCALARPRODUCT_LOOP 6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
262 SCALARPRODUCT_LOOP 4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
263 SCALARPRODUCT_LOOP 2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
264 SCALARPRODUCT_LOOP 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
265 .end:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
266 movhlps m0, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
267 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
268 pshuflw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
269 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
270 movd eax, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
271 RET
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
272
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
273
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
274
10431
546b7ebeaf07 huffyuv: add some const qualifiers
lorenm
parents: 10430
diff changeset
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
;
; Per byte (all arithmetic modulo 256):
;     dst[i] = median(l, t, l + t - tl) + diff[i]
; with t = top[i], tl = top[i-1] (*left_top for i == 0) and
;      l = dst[i-1] (*left for i == 0).
; On return *left = dst[w-1] and *left_top = top[w-1].
;
; The row is handled in whole 8-byte groups (movq loads/stores, index stepped
; by 8 until it is no longer negative), so when w is not a multiple of 8 up to
; 7 bytes beyond w are read from top/diff and written to dst.
;
; Register roles inside the loop:
;   mm0 = t - tl          (current byte in the low lane, shifted down each step)
;   mm1 = t               (ditto; after 7 shifts its low byte is top[last],
;                          which becomes tl for the next group)
;   mm2 = diff            (ditto)
;   mm3 = l               (low byte = previous output pixel)
;   mm7 = output bytes collected so far, filled from the top lane downwards
;   mm4/mm5/mm6 = scratch
; No emms is issued here -- NOTE(review): presumably the caller handles that.
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
%ifdef ARCH_X86_64
    ; w is a 32-bit int; the upper half of its register is unspecified by the
    ; 64-bit calling conventions, and wq is used as a full-width index below.
    movsxd  wq, wd
%endif
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8                  ; top shifted up one byte = top-left for bytes 1..7
    movq    mm1, mm0                ; t
    por     mm4, mm2                ; tl: byte 0 comes from *left_top
    movd    mm3, [leftq]            ; l
    psubb   mm0, mm4                ; t-tl
    add     dstq, wq                ; point all three rows at their ends ...
    add     topq, wq
    add     diffq, wq
    neg     wq                      ; ... and index them with -w .. -1
    jmp     .skip                   ; first group already has t / t-tl set up
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1                ; low byte of mm1 = last top byte of previous group
    movq    mm1, mm0                ; t
    psubb   mm0, mm4                ; t-tl
.skip:
    movq    mm2, [diffq+wq]
    ; The recurrence on l is serial, so the 8 bytes are produced one at a time
    ; in the low lane; the upper lanes hold don't-care values.
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3                ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1                ; max(l, t)
    pminub  mm5, mm1                ; min(l, t)
    pminub  mm3, mm4
    pmaxub  mm3, mm5                ; median
    paddb   mm3, mm2                ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56                 ; keep only the low byte, parked in the top lane
%else
    movq    mm6, mm3
    psrlq   mm7, 8                  ; make room: earlier bytes move down one lane
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8                  ; bring the next input byte into the low lane
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl      .loop
    movzx   r2d, byte [dstq-1]      ; r2 (diff) is dead here; reuse it as scratch
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
10430
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
333
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
334
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
; ADD_HFYU_LEFT_LOOP is_aligned
;
; Body (including the final RET) of the add_hfyu_left_prediction functions:
; a running byte-wise sum of src[] is written to dst[], mmsize bytes per
; iteration, and the last summed byte is returned in eax.
;
;   %1 = is_aligned: 1 -> store with mova,
;                    0 -> store as two 8-byte halves (movq + movhps).
;
; Expects on entry:
;   srcq, dstq, wq/wd  named arguments of the enclosing cglobal
;   m0  running "left" value in its top byte
;   m3, m4 (and m6 when mmsize == 16)  pshufb masks for the in-vector
;       prefix-sum steps -- presumably each replicates the last byte of a
;       lower group into the following group, judging by the constant names
;   m5  pshufb mask applied to m0 -- presumably broadcasts its top byte
; Clobbers m1, m2, eax, flags; srcq/dstq/wq are modified.
;
; The loop runs until the index is no longer negative, so whole vectors are
; always loaded and stored even when w is not a multiple of mmsize.
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
%ifdef ARCH_X86_64
    ; w is a 32-bit int; the upper half of its register is unspecified by the
    ; 64-bit calling conventions, and wq is used as a full-width index below.
    movsxd  wq, wd
%endif
    add     srcq, wq                ; point both rows at their ends ...
    add     dstq, wq
    neg     wq                      ; ... and index them with -w .. -1
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2                  ; odd bytes += preceding even byte (sums of 2)
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2                  ; next prefix-sum step (mask m3)
    pshufb  m0, m5                  ; carry-in from the previous vector / initial left
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2                  ; next prefix-sum step (mask m4)
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2                  ; extra step needed for 16-byte vectors (mask m6)
%endif
    paddb   m0, m1                  ; add carry-in -> output bytes
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl      %%.loop
    ; wq is now the overshoot past the end (0 .. mmsize-1), so
    ; mmsize-1 - w is the position of the last valid byte in m0.
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1                  ; move that byte to lane 0
    movd    eax, m0                 ; low byte of eax = last output byte
    RET
%endmacro
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
372
10431
546b7ebeaf07 huffyuv: add some const qualifiers
lorenm
parents: 10430
diff changeset
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; 8-bytes-per-iteration variant using MMX registers with the SSSE3 pshufb.
; The setup seeds m0 with `left` in its top byte and loads the shuffle masks
; that ADD_HFYU_LEFT_LOOP expects; the macro contains the loop and the RET.
;
; .skip_prologue is also a jump target of add_hfyu_left_prediction_sse4, so
; everything the loop needs must be initialised after that label.
; NOTE(review): because of that cross-function jump, the prologue/epilogue of
; the two functions have to stay compatible (both are declared 3,3,7) --
; confirm against x86inc before changing either declaration.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    movd    m0, leftm
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m5, [pb_7 GLOBAL]
    psllq   m0, 56                  ; left -> top byte of the 8-byte register
    ADD_HFYU_LEFT_LOOP 1
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
383
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
; 16-bytes-per-iteration variant of add_hfyu_left_prediction using XMM
; registers. Same arguments and return value as the ssse3 version.
;
; Dispatch on alignment:
;   src not 16-byte aligned -> jump into the 8-byte ssse3 version (after its
;                              prologue), which redoes its own register setup
;   dst not 16-byte aligned -> 16-byte loads, stores split into two halves
;   both aligned            -> 16-byte loads and stores
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    movd    m0, leftm
    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
    mova    m5, [pb_f GLOBAL]
    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
    pslldq  m0, 15                  ; left -> top byte of the 16-byte register
    test    srcq, 15
    jnz     add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz     .unaligned
    ADD_HFYU_LEFT_LOOP 1            ; expands to loop + RET, no fall-through
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
399
10964
abb3b23bda35 Implement an sse version of scalarproduct_float().
alexc
parents: 10660
diff changeset
400
abb3b23bda35 Implement an sse version of scalarproduct_float().
alexc
parents: 10660
diff changeset
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the sum of v1[i] * v2[i] for i in [0, len).
; Both vectors are read with aligned 16-byte loads, four floats per iteration,
; and the loop body always runs at least once; the loop exits once the byte
; offset is no longer negative.
; NOTE(review): this implies v1/v2 must be 16-byte aligned and len a positive
; multiple of 4 -- confirm against the callers.
;
; Result: xmm0 on x86-64; on x86-32 it is additionally pushed on the x87 stack
; (st0), going through the first argument's stack slot as scratch.
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    ; len is a 32-bit int: scale it with a 32-bit op so that on x86-64 the
    ; unspecified upper half of the register is cleared before 64-bit use.
    shl     offsetd, 2              ; offset = len * sizeof(float)
    add     v1q, offsetq            ; point both vectors at their ends ...
    add     v2q, offsetq
    neg     offsetq                 ; ... and index them with -len*4 .. -16
    xorps   xmm0, xmm0              ; four partial sums
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop
    ; horizontal add of the four partial sums into lane 0
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    ; movss rather than movd: the xmm form of movd is SSE2, and this
    ; function must run on SSE-only CPUs.
    movss   r0m, xmm0
    fld     dword r0m
%endif
    RET