annotate x86/dsputil_yasm.asm @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents 980030a3e315
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
1 ;******************************************************************************
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
2 ;* MMX optimized DSP utils
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
3 ;* Copyright (c) 2008 Loren Merritt
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
4 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
5 ;* This file is part of FFmpeg.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
6 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
7 ;* FFmpeg is free software; you can redistribute it and/or
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
8 ;* modify it under the terms of the GNU Lesser General Public
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
9 ;* License as published by the Free Software Foundation; either
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
10 ;* version 2.1 of the License, or (at your option) any later version.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
11 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
12 ;* FFmpeg is distributed in the hope that it will be useful,
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
15 ;* Lesser General Public License for more details.
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
16 ;*
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
17 ;* You should have received a copy of the GNU Lesser General Public
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
18 ;* License along with FFmpeg; if not, write to the Free Software
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
20 ;******************************************************************************
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
21
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
22 %include "x86inc.asm"
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
23
10430
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
24 SECTION_RODATA
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
25 pb_f: times 16 db 15
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
26 pb_zzzzzzzz77777777: times 8 db -1
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
27 pb_7: times 8 db 7
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
30
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
31 section .text align=16
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
32
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
33 %macro PSWAPD_SSE 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
34 pshufw %1, %2, 0x4e
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
35 %endmacro
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
36 %macro PSWAPD_3DN1 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
37 movq %1, %2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
38 psrlq %1, 32
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
39 punpckldq %1, %2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
40 %endmacro
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
41
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
42 %macro FLOAT_TO_INT16_INTERLEAVE6 1
11931
980030a3e315 Update x264asm header files to latest versions.
darkshikari
parents: 10964
diff changeset
43 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
8430
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
44 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
45 %ifdef ARCH_X86_64
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
46 %define lend r10d
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
47 mov lend, r2d
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
48 %else
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
49 %define lend dword r2m
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
50 %endif
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
51 mov src1q, [srcq+1*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
52 mov src2q, [srcq+2*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
53 mov src3q, [srcq+3*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
54 mov src4q, [srcq+4*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
55 mov src5q, [srcq+5*gprsize]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
56 mov srcq, [srcq]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
57 sub src1q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
58 sub src2q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
59 sub src3q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
60 sub src4q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
61 sub src5q, srcq
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
62 .loop:
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
63 cvtps2pi mm0, [srcq]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
64 cvtps2pi mm1, [srcq+src1q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
65 cvtps2pi mm2, [srcq+src2q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
66 cvtps2pi mm3, [srcq+src3q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
67 cvtps2pi mm4, [srcq+src4q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
68 cvtps2pi mm5, [srcq+src5q]
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
69 packssdw mm0, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
70 packssdw mm1, mm4
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
71 packssdw mm2, mm5
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
72 pswapd mm3, mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
73 punpcklwd mm0, mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
74 punpckhwd mm1, mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
75 punpcklwd mm2, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
76 pswapd mm3, mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
77 punpckldq mm0, mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
78 punpckhdq mm2, mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
79 punpckldq mm1, mm3
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
80 movq [dstq ], mm0
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
81 movq [dstq+16], mm2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
82 movq [dstq+ 8], mm1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
83 add srcq, 8
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
84 add dstq, 24
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
85 sub lend, 2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
86 jg .loop
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
87 emms
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
88 RET
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
89 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
90
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
91 %define pswapd PSWAPD_SSE
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
92 FLOAT_TO_INT16_INTERLEAVE6 sse
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
93 %define cvtps2pi pf2id
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
94 %define pswapd PSWAPD_3DN1
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
95 FLOAT_TO_INT16_INTERLEAVE6 3dnow
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
96 %undef pswapd
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
97 FLOAT_TO_INT16_INTERLEAVE6 3dn2
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
98 %undef cvtps2pi
7768bdfd4f7b Rename libavcodec/i386/ --> libavcodec/x86/.
diego
parents:
diff changeset
99
8760
31138c296ac6 ff_add_hfyu_median_prediction_mmx2
lorenm
parents: 8430
diff changeset
100
31138c296ac6 ff_add_hfyu_median_prediction_mmx2
lorenm
parents: 8430
diff changeset
101
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
102 %macro SCALARPRODUCT 1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
103 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
104 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
105 shl orderq, 1
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
106 add v1q, orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
107 add v2q, orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
108 neg orderq
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
109 movd m3, shiftm
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
110 pxor m2, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
111 .loop:
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
112 movu m0, [v1q + orderq]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
113 movu m1, [v1q + orderq + mmsize]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
114 pmaddwd m0, [v2q + orderq]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
115 pmaddwd m1, [v2q + orderq + mmsize]
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
116 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
117 paddd m2, m1
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
118 add orderq, mmsize*2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
119 jl .loop
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
120 %if mmsize == 16
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
121 movhlps m0, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
122 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
123 psrad m2, m3
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
124 pshuflw m0, m2, 0x4e
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
125 %else
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
126 psrad m2, m3
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
127 pshufw m0, m2, 0x4e
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
128 %endif
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
129 paddd m2, m0
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
130 movd eax, m2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
131 RET
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
132
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
133 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
10660
f0f34732208a fix a crash in ape decoding on x86_32 sse2
lorenm
parents: 10646
diff changeset
134 cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
135 shl orderq, 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
136 movd m7, mulm
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
137 %if mmsize == 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
138 pshuflw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
139 punpcklqdq m7, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
140 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
141 pshufw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
142 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
143 pxor m6, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
144 add v1q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
145 add v2q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
146 add v3q, orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
147 neg orderq
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
148 .loop:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
149 movu m0, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
150 movu m1, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
151 mova m4, [v1q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
152 mova m5, [v1q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
153 movu m2, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
154 movu m3, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
155 pmaddwd m0, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
156 pmaddwd m1, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
157 pmullw m2, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
158 pmullw m3, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
159 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
160 paddd m6, m1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
161 paddw m2, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
162 paddw m3, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
163 mova [v1q + orderq], m2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
164 mova [v1q + orderq + mmsize], m3
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
165 add orderq, mmsize*2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
166 jl .loop
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
167 %if mmsize == 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
168 movhlps m0, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
169 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
170 pshuflw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
171 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
172 pshufw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
173 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
174 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
175 movd eax, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
176 RET
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
177 %endmacro
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
178
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
179 INIT_MMX
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
180 SCALARPRODUCT mmx2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
181 INIT_XMM
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
182 SCALARPRODUCT sse2
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
183
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
184 %macro SCALARPRODUCT_LOOP 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
185 align 16
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
186 .loop%1:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
187 sub orderq, mmsize*2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
188 %if %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
189 mova m1, m4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
190 mova m4, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
191 mova m0, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
192 palignr m1, m0, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
193 palignr m0, m4, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
194 mova m3, m5
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
195 mova m5, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
196 mova m2, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
197 palignr m3, m2, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
198 palignr m2, m5, %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
199 %else
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
200 mova m0, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
201 mova m1, [v2q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
202 mova m2, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
203 mova m3, [v3q + orderq + mmsize]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
204 %endif
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
205 %define t0 [v1q + orderq]
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
206 %define t1 [v1q + orderq + mmsize]
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
207 %ifdef ARCH_X86_64
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
208 mova m8, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
209 mova m9, t1
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
210 %define t0 m8
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
211 %define t1 m9
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
212 %endif
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
213 pmaddwd m0, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
214 pmaddwd m1, t1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
215 pmullw m2, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
216 pmullw m3, m7
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
217 paddw m2, t0
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
218 paddw m3, t1
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
219 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
220 paddd m6, m1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
221 mova [v1q + orderq], m2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
222 mova [v1q + orderq + mmsize], m3
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
223 jg .loop%1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
224 %if %1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
225 jmp .end
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
226 %endif
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
227 %endmacro
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
228
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
229 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
10646
bb14c1586891 slightly faster scalarproduct_and_madd_int16_ssse3 on penryn, no change on conroe
lorenm
parents: 10644
diff changeset
230 cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
231 shl orderq, 1
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
232 movd m7, mulm
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
233 pshuflw m7, m7, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
234 punpcklqdq m7, m7
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
235 pxor m6, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
236 mov r4d, v2d
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
237 and r4d, 15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
238 and v2q, ~15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
239 and v3q, ~15
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
240 mova m4, [v2q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
241 mova m5, [v3q + orderq]
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
242 ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
243 cmp r4d, 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
244 je .loop0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
245 cmp r4d, 2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
246 je .loop2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
247 cmp r4d, 4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
248 je .loop4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
249 cmp r4d, 6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
250 je .loop6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
251 cmp r4d, 8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
252 je .loop8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
253 cmp r4d, 10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
254 je .loop10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
255 cmp r4d, 12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
256 je .loop12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
257 SCALARPRODUCT_LOOP 14
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
258 SCALARPRODUCT_LOOP 12
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
259 SCALARPRODUCT_LOOP 10
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
260 SCALARPRODUCT_LOOP 8
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
261 SCALARPRODUCT_LOOP 6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
262 SCALARPRODUCT_LOOP 4
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
263 SCALARPRODUCT_LOOP 2
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
264 SCALARPRODUCT_LOOP 0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
265 .end:
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
266 movhlps m0, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
267 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
268 pshuflw m0, m6, 0x4e
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
269 paddd m6, m0
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
270 movd eax, m6
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
271 RET
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10633
diff changeset
272
10633
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
273
66242b8fbd32 port ape dsp functions from sse2 to mmx
lorenm
parents: 10434
diff changeset
274
11931
980030a3e315 Update x264asm header files to latest versions.
darkshikari
parents: 10964
diff changeset
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
;
; Rebuilds one row from median-predicted residuals, 8 pixels per pass:
;     dst[i] = median(l, t, l + t - tl) + diff[i]        (byte arithmetic, mod 256)
; where l = dst[i-1] (initially *left), t = top[i], tl = top[i-1]
; (initially *left_top). On exit *left = dst[w-1] and *left_top = top[w-1],
; each stored zero-extended to 32 bits.
;
; Whole 8-byte groups are loaded and stored, so top/diff are read and dst is
; written up to the next multiple of 8 past w. The first group is processed
; unconditionally.
;
; Register roles in the loop:
;   mm0 = t - tl, current pixel in the low byte      mm1 = t
;   mm2 = residuals                                  mm3 = l (low byte)
;   mm7 = output group being assembled               mm4-mm6 = scratch
; No emms is issued in this block.
; NOTE(review): presumably the caller clears MMX state - confirm.
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
%ifdef ARCH_X86_64
    movsxd   wq, wd                 ; w is a C int: bits 63:32 of its register are undefined
%endif
    movq    mm0, [topq]             ; t  = top[0..7]
    movq    mm2, mm0
    movd    mm4, [left_topq]        ; tl for pixel 0
    psllq   mm2, 8                  ; byte i = top[i-1]
    movq    mm1, mm0                ; keep t
    por     mm4, mm2                ; tl vector for the first group
    movd    mm3, [leftq]            ; l for pixel 0
    psubb   mm0, mm4                ; t-tl
    add     dstq, wq                ; point all three rows at their end and
    add     topq, wq                ; walk a negative index up towards zero
    add    diffq, wq
    neg       wq
    jmp .skip                       ; first group already has t / t-tl set up
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1                ; mm1 was shifted down to the previous group's last t
    movq    mm1, mm0                ; t
    psubb   mm0, mm4                ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    ; One pixel per repetition: each output is the "left" of the next, so
    ; the eight pixels of a group cannot be computed in parallel.
    movq    mm4, mm0
    paddb   mm4, mm3                ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1                ; max(l, t)
    pminub  mm5, mm1                ; min(l, t)
    pminub  mm3, mm4
    pmaxub  mm3, mm5                ; median
    paddb   mm3, mm2                ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56                 ; park the new pixel in the top byte
%else
    movq    mm6, mm3
    psrlq   mm7, 8                  ; slide earlier pixels down one byte
    psllq   mm6, 56
    por     mm7, mm6                ; after 8 steps pixel i sits in byte i
%endif
%if i<7
    psrlq   mm0, 8                  ; bring the next pixel's inputs into the low byte
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add       wq, 8
    jl .loop
    movzx    r2d, byte [dstq-1]     ; r2 (diff) is dead here; reuse it as scratch
    mov  [leftq], r2d               ; *left = dst[w-1]
    movzx    r2d, byte [topq-1]
    mov [left_topq], r2d            ; *left_top = top[w-1]
    RET
10430
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
333
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
334
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
; ADD_HFYU_LEFT_LOOP is_aligned
;
; Shared body of the add_hfyu_left_prediction_* entry points. Writes the
; running byte-wise sum (mod 256) of src to dst, mmsize bytes per pass, and
; returns the final sum in the low byte of eax. The macro ends in RET.
;
;   %1 = is_aligned: nonzero -> one mova store per pass
;                    zero    -> two 8-byte stores (movq + movhps, XMM only)
;
; Set up by the caller before expansion:
;   m0     = carry-in ("left") in its highest byte
;   m3, m4 = pshufb masks for the 2nd and 3rd doubling steps
;   m6     = mask for the 4th step (read only when mmsize == 16)
;   m5     = pshufb mask applied to the previous sums to form the carry
;            NOTE(review): by the pb_7 / pb_f names it replicates the top
;            byte; the constants are defined outside this view - confirm.
;
; The loop always runs at least once and works on whole mmsize-byte groups,
; so src is read and dst written up to the next multiple of mmsize past w.
; Clobbers m1, m2, eax; advances srcq, dstq and wq.
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
%ifdef ARCH_X86_64
    movsxd  wq, wd                  ; w is a C int: bits 63:32 of its register are undefined
%endif
    add     srcq, wq                ; index both rows from their end with a
    add     dstq, wq                ; negative offset counting up to zero
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8                   ; even byte of each word -> odd position
    paddb   m1, m2                  ; step 1: sums over byte pairs
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2                  ; step 2
    pshufb  m0, m5                  ; carry from the previous group (or "left")
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2                  ; step 3
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2                  ; step 4, needed for 16-byte groups only
%endif
    paddb   m0, m1                  ; prefix sums + carry
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    ; wq is now the overshoot past w (0..mmsize-1); select the byte that
    ; corresponds to dst[w-1] within the last group.
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1                  ; byte 0 = last valid sum
    movd    eax, m0                 ; only the low byte is meaningful: the other
                                    ; mask bytes are 0, so bytes 1-3 copy m0[0]
    RET
%endmacro
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
372
11931
980030a3e315 Update x264asm header files to latest versions.
darkshikari
parents: 10964
diff changeset
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;
; SSSE3 flavour operating on 8-byte MMX registers (INIT_MMX => mmsize == 8).
; It only prepares the registers that ADD_HFYU_LEFT_LOOP expects; the macro
; does the work and returns.
;
; .skip_prologue is also a jump target for add_hfyu_left_prediction_sse4,
; which uses it when src is not 16-byte aligned. Everything below the label
; initialises m0 and m3-m5 itself, so no SIMD register state is inherited
; from that caller.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    movd    m0, leftm                   ; carry-in pixel ...
    mova    m3, [pb_zz11zz55zz99zzdd]   ; shuffle masks for the doubling steps
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m5, [pb_7]                  ; mask used to form the carry between groups
    psllq   m0, 56                      ; ... moved into the top byte of the register
    ADD_HFYU_LEFT_LOOP 1                ; mova on an MMX register is an 8-byte store
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
383
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
; SSE4 flavour of add_hfyu_left_prediction, 16 bytes per pass
; (INIT_XMM => mmsize == 16). Dispatches on pointer alignment:
;   src not 16-byte aligned -> continue in the 8-byte SSSE3/MMX version
;   dst not 16-byte aligned -> aligned loads, split 8-byte stores
;   both aligned            -> aligned loads and stores
; Both cglobal declarations use the same argument/register counts (3,3,7).
; NOTE(review): presumably so the other function's RET matches this prologue - confirm.
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    movd    m0, leftm                   ; carry-in pixel ...
    mova    m3, [pb_zz11zz55zz99zzdd]   ; shuffle masks for the four doubling steps
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m5, [pb_f]                  ; mask used to form the carry between groups
    pslldq  m0, 15                      ; ... moved into the top byte of the register
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1                ; ends in RET, no fall-through
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
12c8175d6db5 simd add_hfyu_left_prediction
lorenm
parents: 8760
diff changeset
399
10964
abb3b23bda35 Implement an sse version of scalarproduct_float().
alexc
parents: 10660
diff changeset
400
11931
980030a3e315 Update x264asm header files to latest versions.
darkshikari
parents: 10964
diff changeset
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns the dot product of v1[0..len-1] and v2[0..len-1], accumulated four
; floats at a time.
;
; Both vectors are accessed with aligned instructions (movaps / mulps with a
; memory operand), so v1 and v2 must be 16-byte aligned. The loop is
; bottom-tested: one 16-byte group is always processed, and elements are
; consumed in groups of four.
; NOTE(review): presumably callers pass a positive multiple of 4 - confirm.
;
; Result: xmm0 on x86-64; on x86-32 it is moved to st0 through r0m.
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    ; len is a C int: scale it with a 32-bit shift so that, on x86-64, the
    ; undefined upper half of the argument register is cleared before the
    ; value is used as a 64-bit byte offset.
    shl     offsetd, 2              ; len * sizeof(float)
    add     v1q, offsetq            ; point both vectors at their end ...
    add     v2q, offsetq
    neg     offsetq                 ; ... and index with -bytes counting up to 0
    xorps   xmm0, xmm0              ; four partial sums
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js .loop
    ; Horizontal add of the four partial sums.
    movhlps xmm1, xmm0              ; xmm1[0..1] = xmm0[2..3]
    addps   xmm0, xmm1              ; lanes 0,1 = s0+s2, s1+s3
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1           ; bring lane 1 down to lane 0
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m, xmm0               ; bounce through r0m to reach the x87 stack
    fld     dword r0m
%endif
    RET