Mercurial > libavcodec.hg
annotate x86/dsputilenc_yasm.asm @ 12515:307776e26174 libavcodec
Support deinterlacing of YUVJ422P in old deinterlacer.
Patch by Maksym Veremeyenko verem at m1stereo tv.
author | banan |
---|---|
date | Sat, 25 Sep 2010 14:37:54 +0000 |
parents | 9575307cbb82 |
children |
rev | line source |
---|---|
12497
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
1 ;***************************************************************************** |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
2 ;* MMX optimized DSP utils |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
3 ;***************************************************************************** |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
4 ;* Copyright (c) 2000, 2001 Fabrice Bellard |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
5 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
6 ;* |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
7 ;* This file is part of FFmpeg. |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
8 ;* |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
9 ;* FFmpeg is free software; you can redistribute it and/or |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
10 ;* modify it under the terms of the GNU Lesser General Public |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
11 ;* License as published by the Free Software Foundation; either |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
12 ;* version 2.1 of the License, or (at your option) any later version. |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
13 ;* |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
14 ;* FFmpeg is distributed in the hope that it will be useful, |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
17 ;* Lesser General Public License for more details. |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
18 ;* |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
19 ;* You should have received a copy of the GNU Lesser General Public |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
20 ;* License along with FFmpeg; if not, write to the Free Software |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
22 ;***************************************************************************** |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
23 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
24 %include "x86inc.asm" |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
25 %include "x86util.asm" |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
26 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
27 SECTION .text |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
28 |
12498
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
29 %macro DIFF_PIXELS_1 4 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
30 movh %1, %3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
31 movh %2, %4 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
32 punpcklbw %2, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
33 punpcklbw %1, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
34 psubw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
35 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
36 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
37 ; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
38 ; %6=temporary storage location |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
39 ; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
40 %macro DIFF_PIXELS_8 6 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
41 DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
42 DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
43 DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
44 add %1, %5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
45 add %2, %5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
46 DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
47 DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
48 DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
49 DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
50 %ifdef m8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
51 DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
52 %else |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
53 mova [%6], m0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
54 DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
55 mova m0, [%6] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
56 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
57 sub %1, %5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
58 sub %2, %5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
59 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
60 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
61 %macro HADAMARD8 0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
62 SUMSUB_BADC m0, m1, m2, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
63 SUMSUB_BADC m4, m5, m6, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
64 SUMSUB_BADC m0, m2, m1, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
65 SUMSUB_BADC m4, m6, m5, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
66 SUMSUB_BADC m0, m4, m1, m5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
67 SUMSUB_BADC m2, m6, m3, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
68 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
69 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
70 %macro ABS1_SUM 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
71 ABS1 %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
72 paddusw %3, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
73 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
74 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
75 %macro ABS2_SUM 6 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
76 ABS2 %1, %2, %3, %4 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
77 paddusw %5, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
78 paddusw %6, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
79 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
80 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
81 %macro ABS_SUM_8x8_64 1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
82 ABS2 m0, m1, m8, m9 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
83 ABS2_SUM m2, m3, m8, m9, m0, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
84 ABS2_SUM m4, m5, m8, m9, m0, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
85 ABS2_SUM m6, m7, m8, m9, m0, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
86 paddusw m0, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
87 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
88 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
89 %macro ABS_SUM_8x8_32 1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
90 mova [%1], m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
91 ABS1 m0, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
92 ABS1 m1, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
93 ABS1_SUM m2, m7, m0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
94 ABS1_SUM m3, m7, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
95 ABS1_SUM m4, m7, m0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
96 ABS1_SUM m5, m7, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
97 ABS1_SUM m6, m7, m0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
98 mova m2, [%1] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
99 ABS1_SUM m2, m7, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
100 paddusw m0, m1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
101 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
102 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
103 ; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
104 ; about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
105 ; and it's even more unlikely to not have any alternative mvs/modes with lower cost. |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
106 %macro HSUM_MMX 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
107 mova %2, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
108 psrlq %1, 32 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
109 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
110 mova %2, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
111 psrlq %1, 16 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
112 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
113 movd %3, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
114 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
115 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
116 %macro HSUM_MMX2 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
117 pshufw %2, %1, 0xE |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
118 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
119 pshufw %2, %1, 0x1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
120 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
121 movd %3, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
122 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
123 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
124 %macro HSUM_SSE2 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
125 movhlps %2, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
126 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
127 pshuflw %2, %1, 0xE |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
128 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
129 pshuflw %2, %1, 0x1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
130 paddusw %1, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
131 movd %3, %1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
132 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
133 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
134 %macro STORE4 5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
135 mova [%1+mmsize*0], %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
136 mova [%1+mmsize*1], %3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
137 mova [%1+mmsize*2], %4 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
138 mova [%1+mmsize*3], %5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
139 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
140 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
141 %macro LOAD4 5 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
142 mova %2, [%1+mmsize*0] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
143 mova %3, [%1+mmsize*1] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
144 mova %4, [%1+mmsize*2] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
145 mova %5, [%1+mmsize*3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
146 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
147 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
148 %macro hadamard8_16_wrapper 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
149 cglobal hadamard8_diff_%1, 4, 4, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
150 %ifndef m8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
151 %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
152 SUB rsp, pad |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
153 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
154 call hadamard8x8_diff_%1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
155 %ifndef m8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
156 ADD rsp, pad |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
157 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
158 RET |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
159 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
160 cglobal hadamard8_diff16_%1, 5, 6, %2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
161 %ifndef m8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
162 %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
163 SUB rsp, pad |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
164 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
165 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
166 call hadamard8x8_diff_%1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
167 mov r5d, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
168 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
169 add r1, 8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
170 add r2, 8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
171 call hadamard8x8_diff_%1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
172 add r5d, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
173 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
174 cmp r4d, 16 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
175 jne .done |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
176 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
177 lea r1, [r1+r3*8-8] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
178 lea r2, [r2+r3*8-8] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
179 call hadamard8x8_diff_%1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
180 add r5d, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
181 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
182 add r1, 8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
183 add r2, 8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
184 call hadamard8x8_diff_%1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
185 add r5d, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
186 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
187 .done |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
188 mov eax, r5d |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
189 %ifndef m8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
190 ADD rsp, pad |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
191 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
192 RET |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
193 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
194 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
195 %macro HADAMARD8_DIFF_MMX 1 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
196 ALIGN 16 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
197 ; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
198 ; int stride, int h) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
199 ; r0 = void *s = unused, int h = unused (always 8) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
200 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
201 ; can simply call this 2x2x (and that's why we access rsp+gprsize |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
202 ; everywhere, which is rsp of calling func) |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
203 hadamard8x8_diff_%1: |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
204 lea r0, [r3*3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
205 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
206 ; first 4x8 pixels |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
207 DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
208 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
209 mova [rsp+gprsize+0x60], m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
210 TRANSPOSE4x4W 0, 1, 2, 3, 7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
211 STORE4 rsp+gprsize, m0, m1, m2, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
212 mova m7, [rsp+gprsize+0x60] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
213 TRANSPOSE4x4W 4, 5, 6, 7, 0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
214 STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
215 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
216 ; second 4x8 pixels |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
217 DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
218 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
219 mova [rsp+gprsize+0x60], m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
220 TRANSPOSE4x4W 0, 1, 2, 3, 7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
221 STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
222 mova m7, [rsp+gprsize+0x60] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
223 TRANSPOSE4x4W 4, 5, 6, 7, 0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
224 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
225 LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
226 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
227 ABS_SUM_8x8_32 rsp+gprsize+0x60 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
228 mova [rsp+gprsize+0x60], m0 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
229 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
230 LOAD4 rsp+gprsize , m0, m1, m2, m3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
231 LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
232 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
233 ABS_SUM_8x8_32 rsp+gprsize |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
234 paddusw m0, [rsp+gprsize+0x60] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
235 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
236 HSUM m0, m1, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
237 and rax, 0xFFFF |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
238 ret |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
239 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
240 hadamard8_16_wrapper %1, 0, 14 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
241 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
242 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
243 %macro HADAMARD8_DIFF_SSE2 2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
244 hadamard8x8_diff_%1: |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
245 lea r0, [r3*3] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
246 DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
247 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
248 %ifdef ARCH_X86_64 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
249 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
250 %else |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
251 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
252 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
253 HADAMARD8 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
254 ABS_SUM_8x8 rsp+gprsize |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
255 HSUM_SSE2 m0, m1, eax |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
256 and eax, 0xFFFF |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
257 ret |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
258 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
259 hadamard8_16_wrapper %1, %2, 3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
260 %endmacro |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
261 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
262 INIT_MMX |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
263 %define ABS1 ABS1_MMX |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
264 %define HSUM HSUM_MMX |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
265 HADAMARD8_DIFF_MMX mmx |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
266 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
267 %define ABS1 ABS1_MMX2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
268 %define HSUM HSUM_MMX2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
269 HADAMARD8_DIFF_MMX mmx2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
270 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
271 INIT_XMM |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
272 %define ABS2 ABS2_MMX2 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
273 %ifdef ARCH_X86_64 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
274 %define ABS_SUM_8x8 ABS_SUM_8x8_64 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
275 %else |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
276 %define ABS_SUM_8x8 ABS_SUM_8x8_32 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
277 %endif |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
278 HADAMARD8_DIFF_SSE2 sse2, 10 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
279 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
280 %define ABS2 ABS2_SSSE3 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
281 %define ABS_SUM_8x8 ABS_SUM_8x8_64 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
282 HADAMARD8_DIFF_SSE2 ssse3, 9 |
c997f09d1e10
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
rbultje
parents:
12497
diff
changeset
|
283 |
12497
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
284 INIT_XMM |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
285 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
286 cglobal sse16_sse2, 5, 5, 8 |
12500
9575307cbb82
Don't access upper 32 bits of a 32-bit int on 64-bit systems.
rbultje
parents:
12498
diff
changeset
|
287 shr r4d, 1 |
12497
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
288 pxor m0, m0 ; mm0 = 0 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
289 pxor m7, m7 ; mm7 holds the sum |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
290 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
291 .next2lines ; FIXME why are these unaligned movs? pix1[] is aligned |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
292 movu m1, [r1 ] ; mm1 = pix1[0][0-15] |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
293 movu m2, [r2 ] ; mm2 = pix2[0][0-15] |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
294 movu m3, [r1+r3] ; mm3 = pix1[1][0-15] |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
295 movu m4, [r2+r3] ; mm4 = pix2[1][0-15] |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
296 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
297 ; next step: compute abs(mm1-mm2) and abs(mm3-mm4) |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
298 ; algo: subtract mm1 from mm2 with saturation and vice versa |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
299 ; OR the result to get the absolute difference |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
300 mova m5, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
301 mova m6, m3 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
302 psubusb m1, m2 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
303 psubusb m3, m4 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
304 psubusb m2, m5 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
305 psubusb m4, m6 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
306 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
307 por m2, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
308 por m4, m3 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
309 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
310 ; now convert to 16-bit vectors so we can square them |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
311 mova m1, m2 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
312 mova m3, m4 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
313 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
314 punpckhbw m2, m0 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
315 punpckhbw m4, m0 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
316 punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2) |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
317 punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4) |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
318 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
319 pmaddwd m2, m2 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
320 pmaddwd m4, m4 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
321 pmaddwd m1, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
322 pmaddwd m3, m3 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
323 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
324 lea r1, [r1+r3*2] ; pix1 += 2*line_size |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
325 lea r2, [r2+r3*2] ; pix2 += 2*line_size |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
326 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
327 paddd m1, m2 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
328 paddd m3, m4 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
329 paddd m7, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
330 paddd m7, m3 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
331 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
332 dec r4 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
333 jnz .next2lines |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
334 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
335 mova m1, m7 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
336 psrldq m7, 8 ; shift hi qword to lo |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
337 paddd m7, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
338 mova m1, m7 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
339 psrldq m7, 4 ; shift hi dword to lo |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
340 paddd m7, m1 |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
341 movd eax, m7 ; return value |
c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
rbultje
parents:
diff
changeset
|
342 RET |