Mercurial > libavcodec.hg
annotate i386/fdct_mmx.c @ 3198:6b9f0c4fbdbe libavcodec
First part of a series of speed-enhancing patches.
This one sets up a snow.h and makes snow use the dsputil function pointer
framework to access the three functions that will be implemented in asm
in the other parts of the patchset.
Patch by Robert Edele < yartrebo AH earthlink POIS net>
Original thread:
Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Date: Sun, 05 Feb 2006 12:47:14 -0500
author | gpoirier |
---|---|
date | Thu, 16 Mar 2006 19:18:18 +0000 |
parents | bfabfdf9ce55 |
children | 96f9bd6a9ea9 |
rev | line source |
---|---|
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
1 /* |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
2 * MMX optimized forward DCT |
429 | 3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1575
diff
changeset
|
4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
6 * |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT |
2967 | 8 * |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
9 * Intel Application Note AP-922 - fast, precise implementation of DCT |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
10 * http://developer.intel.com/vtune/cbts/appnotes.htm |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
11 * |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
12 * Also of inspiration: |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
15 */ |
2817
b128802eb77b
libavutil: Utility code from libavcodec moved to a separate library.
al
parents:
2293
diff
changeset
|
16 #include "common.h" |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1998
diff
changeset
|
17 #include "../dsputil.h" |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
18 #include "mmx.h" |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
19 |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
20 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
21 |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
22 ////////////////////////////////////////////////////////////////////// |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
23 // |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
24 // constants for the forward DCT |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
25 // ----------------------------- |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
26 // |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
27 // Be sure to check that your compiler is aligning all constants to QWORD |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
28 // (8-byte) memory boundaries! Otherwise the unaligned memory access will |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
29 // severely stall MMX execution. |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
30 // |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
31 ////////////////////////////////////////////////////////////////////// |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
32 |
2979 | 33 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy |
34 #define SHIFT_FRW_COL BITS_FRW_ACC | |
35 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) | |
36 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) | |
37 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
38 |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
39 //concatenated table, for forward DCT transformation |
839 | 40 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { |
2979 | 41 13036, 13036, 13036, 13036, // tg(pi/16) * (1<<16) + 0.5
42 27146, 27146, 27146, 27146, // tg(2*pi/16) * (1<<16) + 0.5 | |
43 -21746, -21746, -21746, -21746, // (tg(3*pi/16) - 1) * (1<<16) + 0.5 | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
44 }; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
45 |
839 | 46 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { |
2979 | 47 23170, 23170, 23170, 23170, //cos(pi/4) * (1<<15) + 0.5
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
48 }; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
49 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
50 static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
51 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
52 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
53 |
2967 | 54 struct |
1933
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
55 { |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2024
diff
changeset
|
56 const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); |
1933
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
57 } fdct_r_row_sse2 ATTR_ALIGN(16)= |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
58 {{ |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
59 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
60 }}; |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
61 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
62 |
839 | 63 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table |
2967 | 64 16384, 16384, 22725, 19266, |
65 16384, 16384, 12873, 4520, | |
66 21407, 8867, 19266, -4520, | |
67 -8867, -21407, -22725, -12873, | |
68 16384, -16384, 12873, -22725, | |
69 -16384, 16384, 4520, 19266, | |
70 8867, -21407, 4520, -12873, | |
71 21407, -8867, 19266, -22725, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
72 |
2967 | 73 22725, 22725, 31521, 26722, |
74 22725, 22725, 17855, 6270, | |
75 29692, 12299, 26722, -6270, | |
76 -12299, -29692, -31521, -17855, | |
77 22725, -22725, 17855, -31521, | |
78 -22725, 22725, 6270, 26722, | |
79 12299, -29692, 6270, -17855, | |
80 29692, -12299, 26722, -31521, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
81 |
2967 | 82 21407, 21407, 29692, 25172, |
83 21407, 21407, 16819, 5906, | |
84 27969, 11585, 25172, -5906, | |
85 -11585, -27969, -29692, -16819, | |
86 21407, -21407, 16819, -29692, | |
87 -21407, 21407, 5906, 25172, | |
88 11585, -27969, 5906, -16819, | |
89 27969, -11585, 25172, -29692, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
90 |
2967 | 91 19266, 19266, 26722, 22654, |
92 19266, 19266, 15137, 5315, | |
93 25172, 10426, 22654, -5315, | |
94 -10426, -25172, -26722, -15137, | |
95 19266, -19266, 15137, -26722, | |
96 -19266, 19266, 5315, 22654, | |
97 10426, -25172, 5315, -15137, | |
98 25172, -10426, 22654, -26722, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
99 |
2967 | 100 16384, 16384, 22725, 19266, |
101 16384, 16384, 12873, 4520, | |
102 21407, 8867, 19266, -4520, | |
103 -8867, -21407, -22725, -12873, | |
104 16384, -16384, 12873, -22725, | |
105 -16384, 16384, 4520, 19266, | |
106 8867, -21407, 4520, -12873, | |
107 21407, -8867, 19266, -22725, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
108 |
2967 | 109 19266, 19266, 26722, 22654, |
110 19266, 19266, 15137, 5315, | |
111 25172, 10426, 22654, -5315, | |
112 -10426, -25172, -26722, -15137, | |
113 19266, -19266, 15137, -26722, | |
114 -19266, 19266, 5315, 22654, | |
115 10426, -25172, 5315, -15137, | |
116 25172, -10426, 22654, -26722, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
117 |
2967 | 118 21407, 21407, 29692, 25172, |
119 21407, 21407, 16819, 5906, | |
120 27969, 11585, 25172, -5906, | |
121 -11585, -27969, -29692, -16819, | |
122 21407, -21407, 16819, -29692, | |
123 -21407, 21407, 5906, 25172, | |
124 11585, -27969, 5906, -16819, | |
125 27969, -11585, 25172, -29692, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
126 |
2967 | 127 22725, 22725, 31521, 26722, |
128 22725, 22725, 17855, 6270, | |
129 29692, 12299, 26722, -6270, | |
130 -12299, -29692, -31521, -17855, | |
131 22725, -22725, 17855, -31521, | |
132 -22725, 22725, 6270, 26722, | |
133 12299, -29692, 6270, -17855, | |
134 29692, -12299, 26722, -31521, | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
135 }; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
136 |
2967 | 137 struct |
1933
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
138 { |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
139 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
140 } tab_frw_01234567_sse2 ATTR_ALIGN(16) = |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
141 {{ |
2967 | 142 //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
143 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
144 C4, C4, C5, C7, C2, C6, C3, -C7, \ |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
145 -C4, C4, C7, C3, C6, -C2, C7, -C5, \ |
2967 | 146 C4, -C4, C5, -C1, C2, -C6, C3, -C1, |
147 // c1..c7 * cos(pi/4) * 2^15 | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
148 #define C1 22725 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
149 #define C2 21407 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
150 #define C3 19266 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
151 #define C4 16384 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
152 #define C5 12873 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
153 #define C6 8867 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
154 #define C7 4520 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
155 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
156 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
157 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
158 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
159 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
160 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
161 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
162 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
163 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
164 #define C1 31521 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
165 #define C2 29692 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
166 #define C3 26722 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
167 #define C4 22725 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
168 #define C5 17855 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
169 #define C6 12299 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
170 #define C7 6270 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
171 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
172 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
173 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
174 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
175 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
176 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
177 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
178 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
179 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
180 #define C1 29692 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
181 #define C2 27969 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
182 #define C3 25172 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
183 #define C4 21407 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
184 #define C5 16819 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
185 #define C6 11585 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
186 #define C7 5906 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
187 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
188 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
189 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
190 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
191 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
192 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
193 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
194 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
195 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
196 #define C1 26722 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
197 #define C2 25172 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
198 #define C3 22654 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
199 #define C4 19266 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
200 #define C5 15137 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
201 #define C6 10426 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
202 #define C7 5315 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
203 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
204 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
205 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
206 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
207 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
208 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
209 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
210 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
211 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
212 #define C1 22725 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
213 #define C2 21407 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
214 #define C3 19266 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
215 #define C4 16384 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
216 #define C5 12873 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
217 #define C6 8867 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
218 #define C7 4520 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
219 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
220 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
221 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
222 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
223 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
224 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
225 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
226 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
227 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
228 #define C1 26722 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
229 #define C2 25172 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
230 #define C3 22654 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
231 #define C4 19266 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
232 #define C5 15137 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
233 #define C6 10426 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
234 #define C7 5315 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
235 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
236 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
237 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
238 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
239 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
240 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
241 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
242 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
243 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
244 #define C1 29692 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
245 #define C2 27969 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
246 #define C3 25172 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
247 #define C4 21407 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
248 #define C5 16819 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
249 #define C6 11585 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
250 #define C7 5906 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
251 TABLE_SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
252 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
253 #undef C1 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
254 #undef C2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
255 #undef C3 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
256 #undef C4 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
257 #undef C5 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
258 #undef C6 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
259 #undef C7 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
260 #define C1 31521 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
261 #define C2 29692 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
262 #define C3 26722 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
263 #define C4 22725 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
264 #define C5 17855 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
265 #define C6 12299 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
266 #define C7 6270 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
267 TABLE_SSE2 |
1933
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
268 }}; |
12408a3bf741
fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents:
1765
diff
changeset
|
269 |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
270 |
1564
b6b7d080f1a1
inline -> always_inline (842 -> 690 cpu cycles for dct_quantize() difference for the dct itself should be even bigger)
michael
parents:
839
diff
changeset
|
271 static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
272 { |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
273 movq_m2r(*(in + offset + 1 * 8), mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
274 movq_m2r(*(in + offset + 6 * 8), mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
275 movq_r2r(mm0, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
276 movq_m2r(*(in + offset + 2 * 8), mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
277 paddsw_r2r(mm1, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
278 movq_m2r(*(in + offset + 5 * 8), mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
279 psllw_i2r(SHIFT_FRW_COL, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
280 movq_m2r(*(in + offset + 0 * 8), mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
281 paddsw_r2r(mm3, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
282 paddsw_m2r(*(in + offset + 7 * 8), mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
283 psllw_i2r(SHIFT_FRW_COL, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
284 movq_r2r(mm0, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
285 psubsw_r2r(mm1, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
286 movq_m2r(*(fdct_tg_all_16 + 4), mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
287 psubsw_r2r(mm4, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
288 movq_m2r(*(in + offset + 3 * 8), mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
289 pmulhw_r2r(mm0, mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
290 paddsw_m2r(*(in + offset + 4 * 8), mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
291 psllw_i2r(SHIFT_FRW_COL, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
292 paddsw_r2r(mm4, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
293 psllw_i2r(SHIFT_FRW_COL, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
294 movq_r2r(mm5, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
295 psubsw_r2r(mm7, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
296 paddsw_r2r(mm5, mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
297 paddsw_r2r(mm7, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
298 por_m2r(fdct_one_corr, mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
299 psllw_i2r(SHIFT_FRW_COL + 1, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
300 pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
301 movq_r2r(mm4, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
302 psubsw_m2r(*(in + offset + 5 * 8), mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
303 psubsw_r2r(mm6, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
304 movq_r2m(mm1, *(out + offset + 2 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
305 paddsw_r2r(mm6, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
306 movq_m2r(*(in + offset + 3 * 8), mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
307 psllw_i2r(SHIFT_FRW_COL + 1, mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
308 psubsw_m2r(*(in + offset + 4 * 8), mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
309 movq_r2r(mm2, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
310 movq_r2m(mm4, *(out + offset + 4 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
311 paddsw_r2r(mm3, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
312 pmulhw_m2r(*ocos_4_16, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
313 psubsw_r2r(mm3, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
314 pmulhw_m2r(*ocos_4_16, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
315 psubsw_r2r(mm0, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
316 por_m2r(fdct_one_corr, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
317 psllw_i2r(SHIFT_FRW_COL, mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
318 por_m2r(fdct_one_corr, mm2); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
319 movq_r2r(mm1, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
320 movq_m2r(*(in + offset + 0 * 8), mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
321 paddsw_r2r(mm6, mm1); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
322 psubsw_m2r(*(in + offset + 7 * 8), mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
323 psubsw_r2r(mm6, mm4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
324 movq_m2r(*(fdct_tg_all_16 + 0), mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
325 psllw_i2r(SHIFT_FRW_COL, mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
326 movq_m2r(*(fdct_tg_all_16 + 8), mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
327 pmulhw_r2r(mm1, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
328 movq_r2m(mm7, *(out + offset + 0 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
329 pmulhw_r2r(mm4, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
330 movq_r2m(mm5, *(out + offset + 6 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
331 movq_r2r(mm3, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
332 movq_m2r(*(fdct_tg_all_16 + 8), mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
333 psubsw_r2r(mm2, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
334 paddsw_r2r(mm2, mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
335 pmulhw_r2r(mm7, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
336 paddsw_r2r(mm3, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
337 paddsw_r2r(mm4, mm6); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
338 pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
339 por_m2r(fdct_one_corr, mm0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
340 paddsw_r2r(mm7, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
341 psubsw_r2r(mm6, mm7); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
342 movq_r2m(mm0, *(out + offset + 1 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
343 paddsw_r2r(mm4, mm5); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
344 movq_r2m(mm7, *(out + offset + 3 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
345 psubsw_r2r(mm1, mm3); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
346 movq_r2m(mm5, *(out + offset + 5 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
347 movq_r2m(mm3, *(out + offset + 7 * 8)); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
348 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
349 |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
350 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
351 static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
352 { |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
353 asm volatile( |
2979 | 354 ".macro FDCT_ROW_SSE2_H1 i t \n\t" |
355 "movq \\i(%0), %%xmm2 \n\t" | |
356 "movq \\i+8(%0), %%xmm0 \n\t" | |
357 "movdqa \\t+32(%1), %%xmm3 \n\t" | |
358 "movdqa \\t+48(%1), %%xmm7 \n\t" | |
359 "movdqa \\t(%1), %%xmm4 \n\t" | |
360 "movdqa \\t+16(%1), %%xmm5 \n\t" | |
361 ".endm \n\t" | |
362 ".macro FDCT_ROW_SSE2_H2 i t \n\t" | |
363 "movq \\i(%0), %%xmm2 \n\t" | |
364 "movq \\i+8(%0), %%xmm0 \n\t" | |
365 "movdqa \\t+32(%1), %%xmm3 \n\t" | |
366 "movdqa \\t+48(%1), %%xmm7 \n\t" | |
367 ".endm \n\t" | |
368 ".macro FDCT_ROW_SSE2 i \n\t" | |
369 "movq %%xmm2, %%xmm1 \n\t" | |
370 "pshuflw $27, %%xmm0, %%xmm0 \n\t" | |
371 "paddsw %%xmm0, %%xmm1 \n\t" | |
372 "psubsw %%xmm0, %%xmm2 \n\t" | |
373 "punpckldq %%xmm2, %%xmm1 \n\t" | |
374 "pshufd $78, %%xmm1, %%xmm2 \n\t" | |
375 "pmaddwd %%xmm2, %%xmm3 \n\t" | |
376 "pmaddwd %%xmm1, %%xmm7 \n\t" | |
377 "pmaddwd %%xmm5, %%xmm2 \n\t" | |
378 "pmaddwd %%xmm4, %%xmm1 \n\t" | |
379 "paddd %%xmm7, %%xmm3 \n\t" | |
380 "paddd %%xmm2, %%xmm1 \n\t" | |
381 "paddd %%xmm6, %%xmm3 \n\t" | |
382 "paddd %%xmm6, %%xmm1 \n\t" | |
383 "psrad %3, %%xmm3 \n\t" | |
384 "psrad %3, %%xmm1 \n\t" | |
385 "packssdw %%xmm3, %%xmm1 \n\t" | |
386 "movdqa %%xmm1, \\i(%4) \n\t" | |
387 ".endm \n\t" | |
388 "movdqa (%2), %%xmm6 \n\t" | |
389 "FDCT_ROW_SSE2_H1 0 0 \n\t" | |
390 "FDCT_ROW_SSE2 0 \n\t" | |
391 "FDCT_ROW_SSE2_H2 64 0 \n\t" | |
392 "FDCT_ROW_SSE2 64 \n\t" | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
393 |
2979 | 394 "FDCT_ROW_SSE2_H1 16 64 \n\t" |
395 "FDCT_ROW_SSE2 16 \n\t" | |
396 "FDCT_ROW_SSE2_H2 112 64 \n\t" | |
397 "FDCT_ROW_SSE2 112 \n\t" | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
398 |
2979 | 399 "FDCT_ROW_SSE2_H1 32 128 \n\t" |
400 "FDCT_ROW_SSE2 32 \n\t" | |
401 "FDCT_ROW_SSE2_H2 96 128 \n\t" | |
402 "FDCT_ROW_SSE2 96 \n\t" | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
403 |
2979 | 404 "FDCT_ROW_SSE2_H1 48 192 \n\t" |
405 "FDCT_ROW_SSE2 48 \n\t" | |
406 "FDCT_ROW_SSE2_H2 80 192 \n\t" | |
407 "FDCT_ROW_SSE2 80 \n\t" | |
408 : | |
409 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
410 ); |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
411 } |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
412 |
/*
 * Row pass of the forward DCT for ONE row of eight int16_t values, using
 * pshufw to reverse the upper half of the row.
 *
 *   in    - 8 input words (read as two 4-word quadwords at in+0 and in+4)
 *   out   - 8 output words (written as two quadwords at out+0 and out+4)
 *   table - coefficient table for this row; 32 words are read, as eight
 *           quadwords at word offsets 0, 4, ..., 28
 *
 * The comments below assume the usual mmx.h macro convention of
 * (source, destination) operand order - NOTE(review): confirm against mmx.h.
 * No emms is issued in this function.
 */
1570 | 413 static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) |
2967 | 414 { |
/* mm5 = in[7], in[6], in[5], in[4]  (0x1B selects words 3,2,1,0: a reversal) */
1565 | 415 pshufw_m2r(*(in + 4), mm5, 0x1B); |
416 movq_m2r(*(in + 0), mm0); | |
/* butterfly with signed saturation: mm0 = in[k] + in[7-k], mm1 = in[k] - in[7-k] */
1998 | 417 movq_r2r(mm0, mm1); |
1570 | 418 paddsw_r2r(mm5, mm0); |
419 psubsw_r2r(mm5, mm1); | |
/* regroup dwords: mm0 = low sums | low differences, mm2 = high sums | high differences */
1998 | 420 movq_r2r(mm0, mm2); |
421 punpckldq_r2r(mm1, mm0); | |
422 punpckhdq_r2r(mm1, mm2); | |
/* preload six of the eight coefficient quadwords */
423 movq_m2r(*(table + 0), mm1); | |
424 movq_m2r(*(table + 4), mm3); | |
425 movq_m2r(*(table + 8), mm4); | |
426 movq_m2r(*(table + 12), mm5); | |
427 movq_m2r(*(table + 16), mm6); | |
1570 | 428 movq_m2r(*(table + 20), mm7); |
/* multiply-accumulate word pairs into 32-bit partial sums */
1998 | 429 pmaddwd_r2r(mm0, mm1); |
430 pmaddwd_r2r(mm2, mm3); | |
1575
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
431 pmaddwd_r2r(mm0, mm4); |
1998 | 432 pmaddwd_r2r(mm2, mm5); |
433 pmaddwd_r2r(mm0, mm6); | |
434 pmaddwd_r2r(mm2, mm7); | |
/* last two products go straight from memory and overwrite the inputs mm0/mm2 */
435 pmaddwd_m2r(*(table + 24), mm0); | |
436 pmaddwd_m2r(*(table + 28), mm2); | |
/* combine partial sums: mm3, mm5, mm7, mm2 each hold two 32-bit results */
437 paddd_r2r(mm1, mm3); | |
438 paddd_r2r(mm4, mm5); | |
439 paddd_r2r(mm6, mm7); | |
440 paddd_r2r(mm0, mm2); | |
/* add the rounding constant, then scale down by SHIFT_FRW_ROW (arithmetic shift) */
441 movq_m2r(*fdct_r_row, mm0); | |
442 paddd_r2r(mm0, mm3); | |
443 paddd_r2r(mm0, mm5); | |
444 paddd_r2r(mm0, mm7); | |
445 paddd_r2r(mm0, mm2); | |
446 psrad_i2r(SHIFT_FRW_ROW, mm3); | |
1570 | 447 psrad_i2r(SHIFT_FRW_ROW, mm5); |
1998 | 448 psrad_i2r(SHIFT_FRW_ROW, mm7); |
449 psrad_i2r(SHIFT_FRW_ROW, mm2); | |
/* pack the eight 32-bit results to 16 bit with signed saturation and store */
450 packssdw_r2r(mm5, mm3); | |
451 packssdw_r2r(mm2, mm7); | |
452 movq_r2m(mm3, *(out + 0)); | |
453 movq_r2m(mm7, *(out + 4)); | |
1570 | 454 } |
455 | |
/*
 * Row pass of the forward DCT for ONE row of eight int16_t values, using
 * only movd/punpcklwd/psrlq to reverse the upper half of the row (the
 * fdct_row_mmx2 variant in this file does that step with a single pshufw).
 *
 *   in    - 8 input words
 *   out   - 8 output words (written as two quadwords at out+0 and out+4)
 *   table - coefficient table for this row; 32 words are read, as eight
 *           quadwords at word offsets 0, 4, ..., 28
 *
 * The comments below assume the usual mmx.h macro convention of
 * (source, destination) operand order - NOTE(review): confirm against mmx.h.
 * No emms is issued in this function.
 */
456 static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) | |
2967 | 457 { |
1998 | 458 //FIXME reorder (i dont have a old mmx only cpu here to benchmark ...) |
1575
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
/* low dword of mm1 = in[6], in[7] */
459 movd_m2r(*(in + 6), mm1); |
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
/* interleave with in[4], in[5]: mm1 = in[6], in[4], in[7], in[5] (low word first) */
460 punpcklwd_m2r(*(in + 4), mm1); |
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
461 movq_r2r(mm1, mm2); |
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
/* shift right by 32 bits: mm1 = in[7], in[5], 0, 0 */
462 psrlq_i2r(0x20, mm1); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
463 movq_m2r(*(in + 0), mm0); |
1575
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
/* second interleave completes the reversal: mm1 = in[7], in[6], in[5], in[4] */
464 punpcklwd_r2r(mm2, mm1); |
1998 | 465 movq_r2r(mm0, mm5); |
1575
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
/* butterfly with signed saturation: mm0 = in[k] + in[7-k], mm5 = in[k] - in[7-k] */
466 paddsw_r2r(mm1, mm0); |
f16ae8e69bd9
reorder table instead of wasting instructions to reorder the input to match the table
michael
parents:
1574
diff
changeset
|
467 psubsw_r2r(mm1, mm5); |
/* regroup dwords: mm0 = low sums | low differences, mm2 = high sums | high differences */
1998 | 468 movq_r2r(mm0, mm2); |
469 punpckldq_r2r(mm5, mm0); | |
470 punpckhdq_r2r(mm5, mm2); | |
/* preload six of the eight coefficient quadwords */
471 movq_m2r(*(table + 0), mm1); | |
472 movq_m2r(*(table + 4), mm3); | |
473 movq_m2r(*(table + 8), mm4); | |
474 movq_m2r(*(table + 12), mm5); | |
475 movq_m2r(*(table + 16), mm6); | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
476 movq_m2r(*(table + 20), mm7); |
/* multiply-accumulate word pairs into 32-bit partial sums */
1998 | 477 pmaddwd_r2r(mm0, mm1); |
478 pmaddwd_r2r(mm2, mm3); | |
479 pmaddwd_r2r(mm0, mm4); | |
480 pmaddwd_r2r(mm2, mm5); | |
481 pmaddwd_r2r(mm0, mm6); | |
482 pmaddwd_r2r(mm2, mm7); | |
/* last two products go straight from memory and overwrite the inputs mm0/mm2 */
483 pmaddwd_m2r(*(table + 24), mm0); | |
484 pmaddwd_m2r(*(table + 28), mm2); | |
/* combine partial sums: mm3, mm5, mm7, mm2 each hold two 32-bit results */
485 paddd_r2r(mm1, mm3); | |
486 paddd_r2r(mm4, mm5); | |
487 paddd_r2r(mm6, mm7); | |
488 paddd_r2r(mm0, mm2); | |
/* add the rounding constant, then scale down by SHIFT_FRW_ROW (arithmetic shift) */
489 movq_m2r(*fdct_r_row, mm0); | |
490 paddd_r2r(mm0, mm3); | |
491 paddd_r2r(mm0, mm5); | |
492 paddd_r2r(mm0, mm7); | |
493 paddd_r2r(mm0, mm2); | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
494 psrad_i2r(SHIFT_FRW_ROW, mm3); |
1998 | 495 psrad_i2r(SHIFT_FRW_ROW, mm5); |
496 psrad_i2r(SHIFT_FRW_ROW, mm7); | |
1570 | 497 psrad_i2r(SHIFT_FRW_ROW, mm2); |
/* pack the eight 32-bit results to 16 bit with signed saturation and store */
1998 | 498 packssdw_r2r(mm5, mm3); |
499 packssdw_r2r(mm2, mm7); | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
500 movq_r2m(mm3, *(out + 0)); |
1998 | 501 movq_r2m(mm7, *(out + 4)); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
502 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
503 |
687
9abb13c21fbe
fdct_mmx -> ff_fdct_mmx (renamed to avoid namespace conflict with xvid)
arpi_esp
parents:
635
diff
changeset
|
504 void ff_fdct_mmx(int16_t *block) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
505 { |
839 | 506 int64_t align_tmp[16] ATTR_ALIGN(8); |
507 int16_t * const block_tmp= (int16_t*)align_tmp; | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
508 int16_t *block1, *out; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
509 const int16_t *table; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
510 int i; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
511 |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
512 block1 = block_tmp; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
513 fdct_col(block, block1, 0); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
514 fdct_col(block, block1, 4); |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
515 |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
516 block1 = block_tmp; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
517 table = tab_frw_01234567; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
518 out = block; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
519 for(i=8;i>0;i--) { |
1570 | 520 fdct_row_mmx(block1, out, table); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
521 block1 += 8; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
522 table += 32; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
523 out += 8; |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
524 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
525 } |
1565 | 526 |
527 void ff_fdct_mmx2(int16_t *block) | |
528 { | |
529 int64_t align_tmp[16] ATTR_ALIGN(8); | |
530 int16_t * const block_tmp= (int16_t*)align_tmp; | |
531 int16_t *block1, *out; | |
532 const int16_t *table; | |
533 int i; | |
534 | |
535 block1 = block_tmp; | |
536 fdct_col(block, block1, 0); | |
537 fdct_col(block, block1, 4); | |
538 | |
539 block1 = block_tmp; | |
540 table = tab_frw_01234567; | |
541 out = block; | |
542 for(i=8;i>0;i--) { | |
1570 | 543 fdct_row_mmx2(block1, out, table); |
1565 | 544 block1 += 8; |
545 table += 32; | |
546 out += 8; | |
547 } | |
548 } | |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
549 |
2967 | 550 void ff_fdct_sse2(int16_t *block) |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
551 { |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
552 int64_t align_tmp[16] ATTR_ALIGN(8); |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
553 int16_t * const block_tmp= (int16_t*)align_tmp; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
554 int16_t *block1; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
555 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
556 block1 = block_tmp; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
557 fdct_col(block, block1, 0); |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
558 fdct_col(block, block1, 4); |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
559 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
560 fdct_row_sse2(block1, block); |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
561 } |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
562 |