annotate x86/dsputilenc_yasm.asm @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents 9575307cbb82
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12497
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
1 ;*****************************************************************************
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
2 ;* MMX optimized DSP utils
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
3 ;*****************************************************************************
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
4 ;* Copyright (c) 2000, 2001 Fabrice Bellard
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
5 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
6 ;*
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
7 ;* This file is part of FFmpeg.
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
8 ;*
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
9 ;* FFmpeg is free software; you can redistribute it and/or
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
10 ;* modify it under the terms of the GNU Lesser General Public
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
11 ;* License as published by the Free Software Foundation; either
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
12 ;* version 2.1 of the License, or (at your option) any later version.
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
13 ;*
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
14 ;* FFmpeg is distributed in the hope that it will be useful,
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
17 ;* Lesser General Public License for more details.
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
18 ;*
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
19 ;* You should have received a copy of the GNU Lesser General Public
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
20 ;* License along with FFmpeg; if not, write to the Free Software
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
22 ;*****************************************************************************
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
23
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
24 %include "x86inc.asm"
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
25 %include "x86util.asm"
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
26
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
27 SECTION .text
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
28
12498
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
29 %macro DIFF_PIXELS_1 4
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
30 movh %1, %3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
31 movh %2, %4
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
32 punpcklbw %2, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
33 punpcklbw %1, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
34 psubw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
35 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
36
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
37 ; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
38 ; %6=temporary storage location
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
39 ; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
40 %macro DIFF_PIXELS_8 6
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
41 DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
42 DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
43 DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
44 add %1, %5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
45 add %2, %5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
46 DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
47 DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
48 DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
49 DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
50 %ifdef m8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
51 DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
52 %else
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
53 mova [%6], m0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
54 DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
55 mova m0, [%6]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
56 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
57 sub %1, %5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
58 sub %2, %5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
59 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
60
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
61 %macro HADAMARD8 0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
62 SUMSUB_BADC m0, m1, m2, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
63 SUMSUB_BADC m4, m5, m6, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
64 SUMSUB_BADC m0, m2, m1, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
65 SUMSUB_BADC m4, m6, m5, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
66 SUMSUB_BADC m0, m4, m1, m5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
67 SUMSUB_BADC m2, m6, m3, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
68 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
69
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
70 %macro ABS1_SUM 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
71 ABS1 %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
72 paddusw %3, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
73 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
74
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
75 %macro ABS2_SUM 6
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
76 ABS2 %1, %2, %3, %4
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
77 paddusw %5, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
78 paddusw %6, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
79 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
80
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
81 %macro ABS_SUM_8x8_64 1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
82 ABS2 m0, m1, m8, m9
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
83 ABS2_SUM m2, m3, m8, m9, m0, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
84 ABS2_SUM m4, m5, m8, m9, m0, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
85 ABS2_SUM m6, m7, m8, m9, m0, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
86 paddusw m0, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
87 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
88
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
89 %macro ABS_SUM_8x8_32 1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
90 mova [%1], m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
91 ABS1 m0, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
92 ABS1 m1, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
93 ABS1_SUM m2, m7, m0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
94 ABS1_SUM m3, m7, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
95 ABS1_SUM m4, m7, m0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
96 ABS1_SUM m5, m7, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
97 ABS1_SUM m6, m7, m0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
98 mova m2, [%1]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
99 ABS1_SUM m2, m7, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
100 paddusw m0, m1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
101 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
102
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
103 ; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
104 ; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
105 ; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
106 %macro HSUM_MMX 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
107 mova %2, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
108 psrlq %1, 32
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
109 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
110 mova %2, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
111 psrlq %1, 16
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
112 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
113 movd %3, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
114 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
115
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
116 %macro HSUM_MMX2 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
117 pshufw %2, %1, 0xE
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
118 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
119 pshufw %2, %1, 0x1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
120 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
121 movd %3, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
122 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
123
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
124 %macro HSUM_SSE2 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
125 movhlps %2, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
126 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
127 pshuflw %2, %1, 0xE
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
128 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
129 pshuflw %2, %1, 0x1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
130 paddusw %1, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
131 movd %3, %1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
132 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
133
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
134 %macro STORE4 5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
135 mova [%1+mmsize*0], %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
136 mova [%1+mmsize*1], %3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
137 mova [%1+mmsize*2], %4
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
138 mova [%1+mmsize*3], %5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
139 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
140
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
141 %macro LOAD4 5
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
142 mova %2, [%1+mmsize*0]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
143 mova %3, [%1+mmsize*1]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
144 mova %4, [%1+mmsize*2]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
145 mova %5, [%1+mmsize*3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
146 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
147
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
148 %macro hadamard8_16_wrapper 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
149 cglobal hadamard8_diff_%1, 4, 4, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
150 %ifndef m8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
151 %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
152 SUB rsp, pad
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
153 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
154 call hadamard8x8_diff_%1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
155 %ifndef m8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
156 ADD rsp, pad
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
157 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
158 RET
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
159
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
160 cglobal hadamard8_diff16_%1, 5, 6, %2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
161 %ifndef m8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
162 %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
163 SUB rsp, pad
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
164 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
165
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
166 call hadamard8x8_diff_%1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
167 mov r5d, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
168
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
169 add r1, 8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
170 add r2, 8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
171 call hadamard8x8_diff_%1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
172 add r5d, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
173
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
174 cmp r4d, 16
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
175 jne .done
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
176
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
177 lea r1, [r1+r3*8-8]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
178 lea r2, [r2+r3*8-8]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
179 call hadamard8x8_diff_%1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
180 add r5d, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
181
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
182 add r1, 8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
183 add r2, 8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
184 call hadamard8x8_diff_%1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
185 add r5d, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
186
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
187 .done
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
188 mov eax, r5d
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
189 %ifndef m8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
190 ADD rsp, pad
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
191 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
192 RET
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
193 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
194
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
195 %macro HADAMARD8_DIFF_MMX 1
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
196 ALIGN 16
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
197 ; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
198 ; int stride, int h)
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
199 ; r0 = void *s = unused, int h = unused (always 8)
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
200 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
201 ; can simply call this 2x2x (and that's why we access rsp+gprsize
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
202 ; everywhere, which is rsp of calling func)
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
203 hadamard8x8_diff_%1:
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
204 lea r0, [r3*3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
205
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
206 ; first 4x8 pixels
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
207 DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
208 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
209 mova [rsp+gprsize+0x60], m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
210 TRANSPOSE4x4W 0, 1, 2, 3, 7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
211 STORE4 rsp+gprsize, m0, m1, m2, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
212 mova m7, [rsp+gprsize+0x60]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
213 TRANSPOSE4x4W 4, 5, 6, 7, 0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
214 STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
215
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
216 ; second 4x8 pixels
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
217 DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
218 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
219 mova [rsp+gprsize+0x60], m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
220 TRANSPOSE4x4W 0, 1, 2, 3, 7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
221 STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
222 mova m7, [rsp+gprsize+0x60]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
223 TRANSPOSE4x4W 4, 5, 6, 7, 0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
224
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
225 LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
226 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
227 ABS_SUM_8x8_32 rsp+gprsize+0x60
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
228 mova [rsp+gprsize+0x60], m0
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
229
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
230 LOAD4 rsp+gprsize , m0, m1, m2, m3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
231 LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
232 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
233 ABS_SUM_8x8_32 rsp+gprsize
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
234 paddusw m0, [rsp+gprsize+0x60]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
235
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
236 HSUM m0, m1, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
237 and rax, 0xFFFF
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
238 ret
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
239
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
240 hadamard8_16_wrapper %1, 0, 14
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
241 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
242
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
243 %macro HADAMARD8_DIFF_SSE2 2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
244 hadamard8x8_diff_%1:
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
245 lea r0, [r3*3]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
246 DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
247 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
248 %ifdef ARCH_X86_64
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
249 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
250 %else
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
251 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
252 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
253 HADAMARD8
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
254 ABS_SUM_8x8 rsp+gprsize
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
255 HSUM_SSE2 m0, m1, eax
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
256 and eax, 0xFFFF
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
257 ret
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
258
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
259 hadamard8_16_wrapper %1, %2, 3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
260 %endmacro
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
261
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
262 INIT_MMX
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
263 %define ABS1 ABS1_MMX
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
264 %define HSUM HSUM_MMX
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
265 HADAMARD8_DIFF_MMX mmx
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
266
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
267 %define ABS1 ABS1_MMX2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
268 %define HSUM HSUM_MMX2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
269 HADAMARD8_DIFF_MMX mmx2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
270
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
271 INIT_XMM
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
272 %define ABS2 ABS2_MMX2
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
273 %ifdef ARCH_X86_64
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
274 %define ABS_SUM_8x8 ABS_SUM_8x8_64
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
275 %else
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
276 %define ABS_SUM_8x8 ABS_SUM_8x8_32
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
277 %endif
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
278 HADAMARD8_DIFF_SSE2 sse2, 10
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
279
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
280 %define ABS2 ABS2_SSSE3
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
281 %define ABS_SUM_8x8 ABS_SUM_8x8_64
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
282 HADAMARD8_DIFF_SSE2 ssse3, 9
c997f09d1e10 Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents: 12497
diff changeset
283
12497
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
284 INIT_XMM
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
285 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
286 cglobal sse16_sse2, 5, 5, 8
12500
9575307cbb82 Don't access upper 32 bits of a 32-bit int on 64-bit systems.
rbultje
parents: 12498
diff changeset
287 shr r4d, 1
12497
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
288 pxor m0, m0 ; mm0 = 0
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
289 pxor m7, m7 ; mm7 holds the sum
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
290
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
291 .next2lines ; FIXME why are these unaligned movs? pix1[] is aligned
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
292 movu m1, [r1 ] ; mm1 = pix1[0][0-15]
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
293 movu m2, [r2 ] ; mm2 = pix2[0][0-15]
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
294 movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
295 movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
296
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
297 ; todo: mm1-mm2, mm3-mm4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
298 ; algo: subtract mm1 from mm2 with saturation and vice versa
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
299 ; OR the result to get the absolute difference
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
300 mova m5, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
301 mova m6, m3
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
302 psubusb m1, m2
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
303 psubusb m3, m4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
304 psubusb m2, m5
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
305 psubusb m4, m6
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
306
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
307 por m2, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
308 por m4, m3
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
309
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
310 ; now convert to 16-bit vectors so we can square them
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
311 mova m1, m2
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
312 mova m3, m4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
313
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
314 punpckhbw m2, m0
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
315 punpckhbw m4, m0
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
316 punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2)
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
317 punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4)
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
318
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
319 pmaddwd m2, m2
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
320 pmaddwd m4, m4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
321 pmaddwd m1, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
322 pmaddwd m3, m3
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
323
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
324 lea r1, [r1+r3*2] ; pix1 += 2*line_size
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
325 lea r2, [r2+r3*2] ; pix2 += 2*line_size
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
326
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
327 paddd m1, m2
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
328 paddd m3, m4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
329 paddd m7, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
330 paddd m7, m3
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
331
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
332 dec r4
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
333 jnz .next2lines
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
334
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
335 mova m1, m7
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
336 psrldq m7, 8 ; shift hi qword to lo
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
337 paddd m7, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
338 mova m1, m7
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
339 psrldq m7, 4 ; shift hi dword to lo
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
340 paddd m7, m1
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
341 movd eax, m7 ; return value
c5ffa8b81f9c Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff changeset
342 RET