Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_qns.h @ 7508:f516e1101167 libavcodec
Simplify rms(): merge a few operations in the same statement
author | vitor |
---|---|
date | Wed, 06 Aug 2008 05:11:46 +0000 |
parents | 33896780c612 |
children | eebc7209c47f |
rev | line source |
---|---|
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
1 /* |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
2 * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
3 * Copyright (c) 2004 Michael Niedermayer |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
4 * |
5214 | 5 * MMX optimization by Michael Niedermayer <michaelni@gmx.at> |
6 * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> | |
7 * | |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
8 * This file is part of FFmpeg. |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
9 * |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
10 * FFmpeg is free software; you can redistribute it and/or |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
11 * modify it under the terms of the GNU Lesser General Public |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
12 * License as published by the Free Software Foundation; either |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
13 * version 2.1 of the License, or (at your option) any later version. |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
14 * |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
15 * FFmpeg is distributed in the hope that it will be useful, |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
18 * Lesser General Public License for more details. |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
19 * |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
20 * You should have received a copy of the GNU Lesser General Public |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
21 * License along with FFmpeg; if not, write to the Free Software |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
23 */ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
24 |
5833
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
25 /* This header intentionally has no multiple inclusion guards. It is meant to |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
26 * be included multiple times and generates different code depending on the |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
27 * value of certain #defines. */ |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
28 |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Exclusive bound on FFABS(scale) for the asm paths in this header: 512, shifted
 * right by SCALE_OFFSET when SCALE_OFFSET is positive, otherwise 512 unchanged.
 * SCALE_OFFSET is not defined in this header; it has to be provided by whatever
 * includes it. */
29 #define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
30 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/**
 * Compute a weighted sum of squares of rem[] plus scaled basis[], over all 64
 * coefficients, without writing to rem[], weight[] or basis[].
 *
 * Each loop iteration handles 8 coefficients (16 bytes) in 16-bit lanes:
 *   t = ((PMULHRW(basis, scale) + rem) >> 6) * weight    (low 16 bits kept)
 * t is then squared and pair-summed into dwords (pmaddwd), the dword sums are
 * shifted right by 4 and accumulated in mm7. After the loop, PHADDD is applied
 * to mm7, the result is shifted right by 2 and returned.
 *
 * @param rem    64 int16_t values, read only here
 * @param weight 64 int16_t per-coefficient weights, read only here
 * @param basis  64 int16_t values, read only here
 * @param scale  multiplier for basis; FFABS(scale) < MAX_ABS is asserted
 * @return the value moved out of mm7 into i, converted to int
 *
 * DEF, SET_RND, PMULHRW, PHADDD, ASMALIGN, SCALE_OFFSET, BASIS_SHIFT and
 * RECON_SHIFT are not defined in this header. PMULHRW is presumably a rounded
 * multiply-high of mm0/mm1 by mm5, and PHADDD presumably a horizontal add of
 * the two dwords of mm7 using mm6 as scratch; confirm against their definitions.
 *
 * NOTE(review): the asm statement has no clobber list although it writes
 * mm0, mm1, mm5, mm6 and mm7 and reads memory through the pointer operands;
 * confirm this is acceptable for the compilers in use.
 */
31 static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
32 { |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5833
diff
changeset
|
/* i is both the byte offset into the arrays (asm operand %0) and, after the
 * loop, the carrier of the result. */
33 x86_reg i=0; |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
34 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
35 assert(FFABS(scale) < MAX_ABS); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Pre-shift scale for PMULHRW. NOTE(review): if scale is negative this is a
 * left shift of a negative int, which ISO C leaves undefined; confirm the
 * build relies on the compiler's arithmetic-shift behavior. */
36 scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
37 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Presumably loads the rounding constant PMULHRW expects into mm6; confirm
 * against the SET_RND definition in the including file. */
38 SET_RND(mm6); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
39 asm volatile( |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* mm7 is the running accumulator; clear it. */
40 "pxor %%mm7, %%mm7 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Load scale (%4) and replicate its low 16-bit word into all four words of mm5. */
41 "movd %4, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
42 "punpcklwd %%mm5, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
43 "punpcklwd %%mm5, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
44 ASMALIGN(4) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
45 "1: \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Load 8 basis coefficients (%1 = basis) at byte offset %0. */
46 "movq (%1, %0), %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
47 "movq 8(%1, %0), %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
48 PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Add the matching rem coefficients (%2 = rem), then drop 6 low bits. */
49 "paddw (%2, %0), %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
50 "paddw 8(%2, %0), %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
51 "psraw $6, %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
52 "psraw $6, %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Multiply by the weights (%3 = weight); pmullw keeps the low 16 bits. */
53 "pmullw (%3, %0), %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
54 "pmullw 8(%3, %0), %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Square each word and sum adjacent pairs into dwords, merge both halves,
 * scale down by 4 bits and add to the accumulator. */
55 "pmaddwd %%mm0, %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
56 "pmaddwd %%mm1, %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
57 "paddd %%mm1, %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
58 "psrld $4, %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
59 "paddd %%mm0, %%mm7 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Advance 16 bytes; stop after 128 bytes, i.e. 64 int16_t coefficients. */
60 "add $16, %0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
61 "cmp $128, %0 \n\t" //FIXME optimize & bench |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
62 " jb 1b \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
63 PHADDD(%%mm7, %%mm6) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
64 "psrld $2, %%mm7 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Reuse the loop counter operand to carry the result out of the asm. */
65 "movd %%mm7, %0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
66 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
67 : "+r" (i) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
68 : "r"(basis), "r"(rem), "r"(weight), "g"(scale) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
69 ); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
70 return i; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
71 } |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
72 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/**
 * Add basis[] scaled by scale into rem[] in place, for all 64 coefficients.
 *
 * When FFABS(scale) < MAX_ABS the MMX loop is used: each iteration handles 8
 * coefficients (16 bytes) as rem += PMULHRW(basis, scale) using 16-bit adds.
 * Otherwise a plain C loop applies the rounded multiply and shift by
 * (BASIS_SHIFT - RECON_SHIFT) in int arithmetic.
 *
 * @param rem   64 int16_t values, updated in place
 * @param basis 64 int16_t values, read only here
 * @param scale multiplier for basis
 *
 * DEF, SET_RND, PMULHRW, ASMALIGN, SCALE_OFFSET, BASIS_SHIFT and RECON_SHIFT
 * are not defined in this header. PMULHRW is presumably a rounded
 * multiply-high of mm0/mm1 by mm5; confirm against its definition.
 *
 * NOTE(review): the asm stores to rem[] through a plain "r" input operand and
 * declares neither a "memory" clobber nor the MMX registers it writes; confirm
 * the compiler cannot cache or reorder accesses to rem[] around it.
 */
73 static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
74 { |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
5833
diff
changeset
|
/* Byte offset in the asm path (operand %0); element index in the C path. */
75 x86_reg i=0; |
5024
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
76 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
77 if(FFABS(scale) < MAX_ABS){ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Pre-shift scale for PMULHRW. NOTE(review): if scale is negative this is a
 * left shift of a negative int, which ISO C leaves undefined; confirm the
 * build relies on the compiler's arithmetic-shift behavior. */
78 scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Presumably loads the rounding constant PMULHRW expects into mm6; confirm
 * against the SET_RND definition in the including file. */
79 SET_RND(mm6); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
80 asm volatile( |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Load scale (%3) and replicate its low 16-bit word into all four words of mm5. */
81 "movd %3, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
82 "punpcklwd %%mm5, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
83 "punpcklwd %%mm5, %%mm5 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
84 ASMALIGN(4) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
85 "1: \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Load 8 basis coefficients (%1 = basis) at byte offset %0. */
86 "movq (%1, %0), %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
87 "movq 8(%1, %0), %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
88 PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Add the current rem coefficients (%2 = rem) and write the sums back. */
89 "paddw (%2, %0), %%mm0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
90 "paddw 8(%2, %0), %%mm1 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
91 "movq %%mm0, (%2, %0) \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
92 "movq %%mm1, 8(%2, %0) \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Advance 16 bytes; stop after 128 bytes, i.e. 64 int16_t coefficients. */
93 "add $16, %0 \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
94 "cmp $128, %0 \n\t" // FIXME optimize & bench |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
95 " jb 1b \n\t" |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
96 |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
97 : "+r" (i) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
98 : "r"(basis), "r"(rem), "g"(scale) |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
99 ); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
100 }else{ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
/* Fallback for scales at or beyond MAX_ABS: same update in plain C, using the
 * unshifted scale and explicit round-to-nearest before the right shift. */
101 for(i=0; i<8*8; i++){ |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
102 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
103 } |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
104 } |
8a3bc96c366f
3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
zuxy
parents:
diff
changeset
|
105 } |