annotate i386/fdct_mmx.c @ 3198:6b9f0c4fbdbe libavcodec

First part of a series of speed-enhancing patches. This one sets up a snow.h and makes snow use the dsputil function pointer framework to access the three functions that will be implemented in asm in the other parts of the patchset. Patch by Robert Edele < yartrebo AH earthlink POIS net> Original thread: Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations Date: Sun, 05 Feb 2006 12:47:14 -0500
author gpoirier
date Thu, 16 Mar 2006 19:18:18 +0000
parents bfabfdf9ce55
children 96f9bd6a9ea9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
1 /*
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
2 * MMX optimized forward DCT
429
718a22dc121f license/copyright change
glantau
parents: 72
diff changeset
3 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
1739
07a484280a82 copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents: 1575
diff changeset
4 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
5 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
6 *
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
7 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
8 *
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
9 * Intel Application Note AP-922 - fast, precise implementation of DCT
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
10 * http://developer.intel.com/vtune/cbts/appnotes.htm
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
11 *
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
12 * Also of inspiration:
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
13 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
14 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
15 */
2817
b128802eb77b libavutil: Utility code from libavcodec moved to a separate library.
al
parents: 2293
diff changeset
16 #include "common.h"
2024
f65d87bfdd5a some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents: 1998
diff changeset
17 #include "../dsputil.h"
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
18 #include "mmx.h"
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
19
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
20 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
21
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
22 //////////////////////////////////////////////////////////////////////
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
23 //
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
24 // constants for the forward DCT
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
25 // -----------------------------
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
26 //
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
27 // Be sure to check that your compiler is aligning all constants to QWORD
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
28 // (8-byte) memory boundaries! Otherwise the unaligned memory access will
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
29 // severely stall MMX execution.
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
30 //
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
31 //////////////////////////////////////////////////////////////////////
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
32
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
33 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
34 #define SHIFT_FRW_COL BITS_FRW_ACC
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
35 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
36 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
37 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
38
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
39 //concatenated table, for forward DCT transformation
839
b7e2b8129211 cleanup
michaelni
parents: 687
diff changeset
40 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
41 13036, 13036, 13036, 13036, // tg * (1<<16) + 0.5
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
42 27146, 27146, 27146, 27146, // tg * (1<<16) + 0.5
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
43 -21746, -21746, -21746, -21746, // tg * (1<<16) + 0.5, minus (1<<16) so the value fits in int16_t
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
44 };
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
45
839
b7e2b8129211 cleanup
michaelni
parents: 687
diff changeset
46 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
47 23170, 23170, 23170, 23170, //cos * (1<<15) + 0.5
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
48 };
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
49
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2024
diff changeset
50 static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
51
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2024
diff changeset
52 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
53
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
54 struct
1933
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
55 {
2293
15cfba1b97b5 adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents: 2024
diff changeset
56 const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
1933
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
57 } fdct_r_row_sse2 ATTR_ALIGN(16)=
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
58 {{
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
59 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
60 }};
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
61 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
62
839
b7e2b8129211 cleanup
michaelni
parents: 687
diff changeset
63 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
64 16384, 16384, 22725, 19266,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
65 16384, 16384, 12873, 4520,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
66 21407, 8867, 19266, -4520,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
67 -8867, -21407, -22725, -12873,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
68 16384, -16384, 12873, -22725,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
69 -16384, 16384, 4520, 19266,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
70 8867, -21407, 4520, -12873,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
71 21407, -8867, 19266, -22725,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
72
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
73 22725, 22725, 31521, 26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
74 22725, 22725, 17855, 6270,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
75 29692, 12299, 26722, -6270,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
76 -12299, -29692, -31521, -17855,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
77 22725, -22725, 17855, -31521,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
78 -22725, 22725, 6270, 26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
79 12299, -29692, 6270, -17855,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
80 29692, -12299, 26722, -31521,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
81
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
82 21407, 21407, 29692, 25172,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
83 21407, 21407, 16819, 5906,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
84 27969, 11585, 25172, -5906,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
85 -11585, -27969, -29692, -16819,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
86 21407, -21407, 16819, -29692,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
87 -21407, 21407, 5906, 25172,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
88 11585, -27969, 5906, -16819,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
89 27969, -11585, 25172, -29692,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
90
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
91 19266, 19266, 26722, 22654,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
92 19266, 19266, 15137, 5315,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
93 25172, 10426, 22654, -5315,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
94 -10426, -25172, -26722, -15137,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
95 19266, -19266, 15137, -26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
96 -19266, 19266, 5315, 22654,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
97 10426, -25172, 5315, -15137,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
98 25172, -10426, 22654, -26722,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
99
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
100 16384, 16384, 22725, 19266,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
101 16384, 16384, 12873, 4520,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
102 21407, 8867, 19266, -4520,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
103 -8867, -21407, -22725, -12873,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
104 16384, -16384, 12873, -22725,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
105 -16384, 16384, 4520, 19266,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
106 8867, -21407, 4520, -12873,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
107 21407, -8867, 19266, -22725,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
108
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
109 19266, 19266, 26722, 22654,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
110 19266, 19266, 15137, 5315,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
111 25172, 10426, 22654, -5315,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
112 -10426, -25172, -26722, -15137,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
113 19266, -19266, 15137, -26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
114 -19266, 19266, 5315, 22654,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
115 10426, -25172, 5315, -15137,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
116 25172, -10426, 22654, -26722,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
117
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
118 21407, 21407, 29692, 25172,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
119 21407, 21407, 16819, 5906,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
120 27969, 11585, 25172, -5906,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
121 -11585, -27969, -29692, -16819,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
122 21407, -21407, 16819, -29692,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
123 -21407, 21407, 5906, 25172,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
124 11585, -27969, 5906, -16819,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
125 27969, -11585, 25172, -29692,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
126
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
127 22725, 22725, 31521, 26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
128 22725, 22725, 17855, 6270,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
129 29692, 12299, 26722, -6270,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
130 -12299, -29692, -31521, -17855,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
131 22725, -22725, 17855, -31521,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
132 -22725, 22725, 6270, 26722,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
133 12299, -29692, 6270, -17855,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
134 29692, -12299, 26722, -31521,
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
135 };
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
136
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
137 struct
1933
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
138 {
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
139 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
140 } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
141 {{
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
142 //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
143 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
144 C4, C4, C5, C7, C2, C6, C3, -C7, \
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
145 -C4, C4, C7, C3, C6, -C2, C7, -C5, \
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
146 C4, -C4, C5, -C1, C2, -C6, C3, -C1,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
147 // c1..c7 * cos(pi/4) * 2^15
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
148 #define C1 22725
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
149 #define C2 21407
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
150 #define C3 19266
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
151 #define C4 16384
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
152 #define C5 12873
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
153 #define C6 8867
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
154 #define C7 4520
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
155 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
156
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
157 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
158 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
159 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
160 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
161 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
162 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
163 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
164 #define C1 31521
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
165 #define C2 29692
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
166 #define C3 26722
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
167 #define C4 22725
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
168 #define C5 17855
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
169 #define C6 12299
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
170 #define C7 6270
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
171 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
172
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
173 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
174 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
175 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
176 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
177 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
178 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
179 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
180 #define C1 29692
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
181 #define C2 27969
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
182 #define C3 25172
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
183 #define C4 21407
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
184 #define C5 16819
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
185 #define C6 11585
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
186 #define C7 5906
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
187 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
188
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
189 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
190 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
191 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
192 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
193 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
194 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
195 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
196 #define C1 26722
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
197 #define C2 25172
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
198 #define C3 22654
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
199 #define C4 19266
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
200 #define C5 15137
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
201 #define C6 10426
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
202 #define C7 5315
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
203 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
204
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
205 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
206 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
207 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
208 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
209 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
210 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
211 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
212 #define C1 22725
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
213 #define C2 21407
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
214 #define C3 19266
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
215 #define C4 16384
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
216 #define C5 12873
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
217 #define C6 8867
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
218 #define C7 4520
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
219 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
220
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
221 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
222 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
223 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
224 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
225 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
226 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
227 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
228 #define C1 26722
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
229 #define C2 25172
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
230 #define C3 22654
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
231 #define C4 19266
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
232 #define C5 15137
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
233 #define C6 10426
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
234 #define C7 5315
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
235 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
236
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
237 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
238 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
239 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
240 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
241 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
242 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
243 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
244 #define C1 29692
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
245 #define C2 27969
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
246 #define C3 25172
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
247 #define C4 21407
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
248 #define C5 16819
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
249 #define C6 11585
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
250 #define C7 5906
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
251 TABLE_SSE2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
252
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
253 #undef C1
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
254 #undef C2
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
255 #undef C3
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
256 #undef C4
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
257 #undef C5
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
258 #undef C6
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
259 #undef C7
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
260 #define C1 31521
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
261 #define C2 29692
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
262 #define C3 26722
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
263 #define C4 22725
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
264 #define C5 17855
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
265 #define C6 12299
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
266 #define C7 6270
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
267 TABLE_SSE2
1933
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
268 }};
12408a3bf741 fixing alignment problems -> SSE2 support enabled again in libavcodec (from ffdshow / milan_cutka)
michael
parents: 1765
diff changeset
269
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
270
1564
b6b7d080f1a1 inline -> always_inline (842 -> 690 cpu cycles for dct_quantize() difference for the dct itself should be even bigger)
michael
parents: 839
diff changeset
/**
 * Column pass of the forward DCT, written with the MMX helper macros.
 *
 * Reads the eight rows in[offset + k * 8] (k = 0..7) and writes eight rows
 * to out[offset + k * 8]. The constants fdct_tg_all_16 (at +0, +4 and +8),
 * ocos_4_16, fdct_one_corr and SHIFT_FRW_COL are defined outside this
 * function.
 *
 * NOTE(review): the *_m2r / *_r2r / *_r2m / *_i2r macros are not visible
 * here; they are assumed to emit the like-named MMX instruction with the
 * source operand first and the destination second. Under that assumption
 * each movq moves four int16_t values, so one call would process four
 * adjacent columns -- confirm against the macro definitions and the caller.
 *
 * @param in     input coefficients (read only)
 * @param out    output coefficients
 * @param offset element offset added to both in and out
 *
 * Loads, arithmetic and stores are interleaved and every register mm0-mm7
 * is reused several times, so the statement order must not be changed
 * without re-checking register liveness.
 */
271 static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
272 {
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
273 movq_m2r(*(in + offset + 1 * 8), mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
274 movq_m2r(*(in + offset + 6 * 8), mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
275 movq_r2r(mm0, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
276 movq_m2r(*(in + offset + 2 * 8), mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
277 paddsw_r2r(mm1, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
278 movq_m2r(*(in + offset + 5 * 8), mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
279 psllw_i2r(SHIFT_FRW_COL, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
280 movq_m2r(*(in + offset + 0 * 8), mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
281 paddsw_r2r(mm3, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
282 paddsw_m2r(*(in + offset + 7 * 8), mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
283 psllw_i2r(SHIFT_FRW_COL, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
284 movq_r2r(mm0, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
285 psubsw_r2r(mm1, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
286 movq_m2r(*(fdct_tg_all_16 + 4), mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
287 psubsw_r2r(mm4, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
288 movq_m2r(*(in + offset + 3 * 8), mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
289 pmulhw_r2r(mm0, mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
290 paddsw_m2r(*(in + offset + 4 * 8), mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
291 psllw_i2r(SHIFT_FRW_COL, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
292 paddsw_r2r(mm4, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
293 psllw_i2r(SHIFT_FRW_COL, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
294 movq_r2r(mm5, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
295 psubsw_r2r(mm7, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
296 paddsw_r2r(mm5, mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
297 paddsw_r2r(mm7, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
/* fdct_one_corr is ORed into several results before they are stored or
 * reused (presumably a rounding correction -- confirm against its
 * definition). */
298 por_m2r(fdct_one_corr, mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
/* mm2 here and mm3 below are shifted by one bit more than the other terms
 * before their sum and difference are multiplied by ocos_4_16. */
299 psllw_i2r(SHIFT_FRW_COL + 1, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
300 pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
301 movq_r2r(mm4, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
302 psubsw_m2r(*(in + offset + 5 * 8), mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
303 psubsw_r2r(mm6, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
304 movq_r2m(mm1, *(out + offset + 2 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
305 paddsw_r2r(mm6, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
306 movq_m2r(*(in + offset + 3 * 8), mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
307 psllw_i2r(SHIFT_FRW_COL + 1, mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
308 psubsw_m2r(*(in + offset + 4 * 8), mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
309 movq_r2r(mm2, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
310 movq_r2m(mm4, *(out + offset + 4 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
311 paddsw_r2r(mm3, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
312 pmulhw_m2r(*ocos_4_16, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
313 psubsw_r2r(mm3, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
314 pmulhw_m2r(*ocos_4_16, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
315 psubsw_r2r(mm0, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
316 por_m2r(fdct_one_corr, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
317 psllw_i2r(SHIFT_FRW_COL, mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
318 por_m2r(fdct_one_corr, mm2);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
319 movq_r2r(mm1, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
320 movq_m2r(*(in + offset + 0 * 8), mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
321 paddsw_r2r(mm6, mm1);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
322 psubsw_m2r(*(in + offset + 7 * 8), mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
323 psubsw_r2r(mm6, mm4);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
324 movq_m2r(*(fdct_tg_all_16 + 0), mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
325 psllw_i2r(SHIFT_FRW_COL, mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
326 movq_m2r(*(fdct_tg_all_16 + 8), mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
327 pmulhw_r2r(mm1, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
328 movq_r2m(mm7, *(out + offset + 0 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
329 pmulhw_r2r(mm4, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
330 movq_r2m(mm5, *(out + offset + 6 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
331 movq_r2r(mm3, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
332 movq_m2r(*(fdct_tg_all_16 + 8), mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
333 psubsw_r2r(mm2, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
334 paddsw_r2r(mm2, mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
335 pmulhw_r2r(mm7, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
336 paddsw_r2r(mm3, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
337 paddsw_r2r(mm4, mm6);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
338 pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
339 por_m2r(fdct_one_corr, mm0);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
340 paddsw_r2r(mm7, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
341 psubsw_r2r(mm6, mm7);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
342 movq_r2m(mm0, *(out + offset + 1 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
343 paddsw_r2r(mm4, mm5);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
344 movq_r2m(mm7, *(out + offset + 3 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
345 psubsw_r2r(mm1, mm3);
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
346 movq_r2m(mm5, *(out + offset + 5 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
347 movq_r2m(mm3, *(out + offset + 7 * 8));
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
348 }
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
349
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
350
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
/**
 * Row pass of the forward DCT for eight rows, as one SSE2 inline-asm
 * statement.
 *
 * Asm operands: %0 = in, %1 = tab_frw_01234567_sse2 table, %2 =
 * fdct_r_row_sse2 rounding vector, %3 = SHIFT_FRW_ROW (immediate),
 * %4 = out.
 *
 * Rows are addressed by byte offset (16 bytes per row) and processed in the
 * order 0, 64, 16, 112, 32, 96, 48, 80. Each consecutive pair shares one
 * table byte offset (0, 64, 128, 192).
 *
 * @param in  input rows; read with 8-byte movq loads
 * @param out output rows; written with movdqa, so each 16-byte row must be
 *            16-byte aligned
 *
 * NOTE(review): the assembler macros are defined inside the asm template.
 * If the compiler emits this statement more than once in a translation
 * unit, the .macro names would be defined twice -- confirm that it is only
 * expanded once.
 */
351 static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
352 {
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
353 asm volatile(
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* H1: load the two 8-byte halves of the input row at byte offset i into
 * xmm2 and xmm0, and all four table vectors at table byte offset t into
 * xmm3, xmm7, xmm4 and xmm5. */
354 ".macro FDCT_ROW_SSE2_H1 i t \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
355 "movq \\i(%0), %%xmm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
356 "movq \\i+8(%0), %%xmm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
357 "movdqa \\t+32(%1), %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
358 "movdqa \\t+48(%1), %%xmm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
359 "movdqa \\t(%1), %%xmm4 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
360 "movdqa \\t+16(%1), %%xmm5 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
361 ".endm \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* H2: like H1 but reloads only xmm3 and xmm7, which FDCT_ROW_SSE2
 * overwrites as pmaddwd destinations. xmm4 and xmm5 are only read there,
 * so they are kept from the preceding H1, which uses the same t. */
362 ".macro FDCT_ROW_SSE2_H2 i t \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
363 "movq \\i(%0), %%xmm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
364 "movq \\i+8(%0), %%xmm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
365 "movdqa \\t+32(%1), %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
366 "movdqa \\t+48(%1), %%xmm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
367 ".endm \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* Core: reverse the second half of the row (pshuflw $27), form saturating
 * sums and differences with the first half, run four pmaddwd against the
 * table vectors, add the rounding vector in xmm6, shift right by %3, pack
 * to 16 bit with saturation and store 16 bytes to out at byte offset i. */
368 ".macro FDCT_ROW_SSE2 i \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
369 "movq %%xmm2, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
370 "pshuflw $27, %%xmm0, %%xmm0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
371 "paddsw %%xmm0, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
372 "psubsw %%xmm0, %%xmm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
373 "punpckldq %%xmm2, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
374 "pshufd $78, %%xmm1, %%xmm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
375 "pmaddwd %%xmm2, %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
376 "pmaddwd %%xmm1, %%xmm7 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
377 "pmaddwd %%xmm5, %%xmm2 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
378 "pmaddwd %%xmm4, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
379 "paddd %%xmm7, %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
380 "paddd %%xmm2, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
381 "paddd %%xmm6, %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
382 "paddd %%xmm6, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
383 "psrad %3, %%xmm3 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
384 "psrad %3, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
385 "packssdw %%xmm3, %%xmm1 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
386 "movdqa %%xmm1, \\i(%4) \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
387 ".endm \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* xmm6 holds the rounding vector from operand %2 for all eight rows. */
388 "movdqa (%2), %%xmm6 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
389 "FDCT_ROW_SSE2_H1 0 0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
390 "FDCT_ROW_SSE2 0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
391 "FDCT_ROW_SSE2_H2 64 0 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
392 "FDCT_ROW_SSE2 64 \n\t"
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
393
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
394 "FDCT_ROW_SSE2_H1 16 64 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
395 "FDCT_ROW_SSE2 16 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
396 "FDCT_ROW_SSE2_H2 112 64 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
397 "FDCT_ROW_SSE2 112 \n\t"
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
398
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
399 "FDCT_ROW_SSE2_H1 32 128 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
400 "FDCT_ROW_SSE2 32 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
401 "FDCT_ROW_SSE2_H2 96 128 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
402 "FDCT_ROW_SSE2 96 \n\t"
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
403
2979
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
404 "FDCT_ROW_SSE2_H1 48 192 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
405 "FDCT_ROW_SSE2 48 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
406 "FDCT_ROW_SSE2_H2 80 192 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
407 "FDCT_ROW_SSE2 80 \n\t"
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
/* NOTE(review): the output list is empty and there is no clobber list,
 * although the template stores through %4 and overwrites xmm0-xmm7.
 * Consider whether a memory clobber (and register clobbers) should be
 * declared so the compiler does not cache values of out across this
 * statement. */
408 :
bfabfdf9ce55 COSMETICS: tabs --> spaces, some prettyprinting
diego
parents: 2967
diff changeset
409 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
410 );
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
411 }
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
412
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
/**
 * Row pass of the forward DCT for a single row, MMX2 variant (uses pshufw).
 *
 * NOTE(review): the helper macros are not visible here; the description
 * assumes they emit the like-named instruction with the source operand
 * first and the destination second, so that each movq moves four int16_t
 * values.
 *
 * @param in    input row; read at in + 0 and in + 4
 * @param out   output row; written at out + 0 and out + 4
 * @param table coefficient table for this row; read at table + 0 through
 *              table + 28 in steps of 4
 *
 * The rounding constant fdct_r_row and the shift SHIFT_FRW_ROW are defined
 * outside this function.
 */
413 static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
414 {
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
/* Shuffle control 0x1B loads the second half of the row with its four
 * words in reverse order, ready for the sum/difference with the first
 * half. */
415 pshufw_m2r(*(in + 4), mm5, 0x1B);
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
416 movq_m2r(*(in + 0), mm0);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
417 movq_r2r(mm0, mm1);
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
418 paddsw_r2r(mm5, mm0);
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
419 psubsw_r2r(mm5, mm1);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
420 movq_r2r(mm0, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
421 punpckldq_r2r(mm1, mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
422 punpckhdq_r2r(mm1, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
/* Six table vectors go into registers; the last two (table + 24 and
 * table + 28) are used directly as memory operands further down. */
423 movq_m2r(*(table + 0), mm1);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
424 movq_m2r(*(table + 4), mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
425 movq_m2r(*(table + 8), mm4);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
426 movq_m2r(*(table + 12), mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
427 movq_m2r(*(table + 16), mm6);
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
428 movq_m2r(*(table + 20), mm7);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
429 pmaddwd_r2r(mm0, mm1);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
430 pmaddwd_r2r(mm2, mm3);
1575
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
431 pmaddwd_r2r(mm0, mm4);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
432 pmaddwd_r2r(mm2, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
433 pmaddwd_r2r(mm0, mm6);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
434 pmaddwd_r2r(mm2, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
435 pmaddwd_m2r(*(table + 24), mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
436 pmaddwd_m2r(*(table + 28), mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
437 paddd_r2r(mm1, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
438 paddd_r2r(mm4, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
439 paddd_r2r(mm6, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
440 paddd_r2r(mm0, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
/* Add fdct_r_row to each of the four accumulators, shift right by
 * SHIFT_FRW_ROW, then pack to 16 bit with saturation and store. */
441 movq_m2r(*fdct_r_row, mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
442 paddd_r2r(mm0, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
443 paddd_r2r(mm0, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
444 paddd_r2r(mm0, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
445 paddd_r2r(mm0, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
446 psrad_i2r(SHIFT_FRW_ROW, mm3);
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
447 psrad_i2r(SHIFT_FRW_ROW, mm5);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
448 psrad_i2r(SHIFT_FRW_ROW, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
449 psrad_i2r(SHIFT_FRW_ROW, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
450 packssdw_r2r(mm5, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
451 packssdw_r2r(mm2, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
452 movq_r2m(mm3, *(out + 0));
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
453 movq_r2m(mm7, *(out + 4));
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
454 }
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
455
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
456 static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
457 {
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
458 //FIXME reorder (i dont have a old mmx only cpu here to benchmark ...)
1575
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
459 movd_m2r(*(in + 6), mm1);
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
460 punpcklwd_m2r(*(in + 4), mm1);
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
461 movq_r2r(mm1, mm2);
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
462 psrlq_i2r(0x20, mm1);
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
463 movq_m2r(*(in + 0), mm0);
1575
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
464 punpcklwd_r2r(mm2, mm1);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
465 movq_r2r(mm0, mm5);
1575
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
466 paddsw_r2r(mm1, mm0);
f16ae8e69bd9 reorder table instead of wasting instructions to reorder the input to match the table
michael
parents: 1574
diff changeset
467 psubsw_r2r(mm1, mm5);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
468 movq_r2r(mm0, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
469 punpckldq_r2r(mm5, mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
470 punpckhdq_r2r(mm5, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
471 movq_m2r(*(table + 0), mm1);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
472 movq_m2r(*(table + 4), mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
473 movq_m2r(*(table + 8), mm4);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
474 movq_m2r(*(table + 12), mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
475 movq_m2r(*(table + 16), mm6);
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
476 movq_m2r(*(table + 20), mm7);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
477 pmaddwd_r2r(mm0, mm1);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
478 pmaddwd_r2r(mm2, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
479 pmaddwd_r2r(mm0, mm4);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
480 pmaddwd_r2r(mm2, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
481 pmaddwd_r2r(mm0, mm6);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
482 pmaddwd_r2r(mm2, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
483 pmaddwd_m2r(*(table + 24), mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
484 pmaddwd_m2r(*(table + 28), mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
485 paddd_r2r(mm1, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
486 paddd_r2r(mm4, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
487 paddd_r2r(mm6, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
488 paddd_r2r(mm0, mm2);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
489 movq_m2r(*fdct_r_row, mm0);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
490 paddd_r2r(mm0, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
491 paddd_r2r(mm0, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
492 paddd_r2r(mm0, mm7);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
493 paddd_r2r(mm0, mm2);
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
494 psrad_i2r(SHIFT_FRW_ROW, mm3);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
495 psrad_i2r(SHIFT_FRW_ROW, mm5);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
496 psrad_i2r(SHIFT_FRW_ROW, mm7);
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
497 psrad_i2r(SHIFT_FRW_ROW, mm2);
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
498 packssdw_r2r(mm5, mm3);
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
499 packssdw_r2r(mm2, mm7);
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
500 movq_r2m(mm3, *(out + 0));
1998
5bc1a9ad6c33 mmx dct optimization
michael
parents: 1933
diff changeset
501 movq_r2m(mm7, *(out + 4));
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
502 }
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
503
687
9abb13c21fbe fdct_mmx -> ff_fdct_mmx (renamed to avoid namespace conflict with xvid)
arpi_esp
parents: 635
diff changeset
/*
 * ff_fdct_mmx - MMX entry point of the forward DCT (see the file header).
 *
 * block: caller's int16_t coefficient buffer. It is passed as the first
 *        argument of both fdct_col() calls and, via `out`, as the second
 *        argument of every fdct_row_mmx() call.
 *
 * Two stages are run through a local scratch buffer:
 *   1. fdct_col() is called twice, with offsets 0 and 4, given `block` and
 *      the scratch buffer.
 *   2. fdct_row_mmx() is called 8 times; each call advances 8 int16_t in the
 *      scratch buffer and in `block`, and 32 int16_t in tab_frw_01234567.
 *
 * NOTE(review): fdct_col(), fdct_row_mmx(), ATTR_ALIGN and tab_frw_01234567
 * are defined outside this excerpt; presumably stage 1 is the column pass
 * (4 columns per call) and stage 2 the row pass writing back into `block` -
 * confirm against their definitions.
 */
504 void ff_fdct_mmx(int16_t *block)
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
505 {
839
b7e2b8129211 cleanup
michaelni
parents: 687
diff changeset
506 int64_t align_tmp[16] ATTR_ALIGN(8); /* 16 * 8 bytes = room for 64 int16_t; ATTR_ALIGN(8) presumably requests 8-byte alignment - confirm */
b7e2b8129211 cleanup
michaelni
parents: 687
diff changeset
507 int16_t * const block_tmp= (int16_t*)align_tmp; /* int16_t view of the scratch buffer */
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
508 int16_t *block1, *out;
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
509 const int16_t *table;
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
510 int i;
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
511
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
512 block1 = block_tmp;
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
513 fdct_col(block, block1, 0); /* stage 1, offset 0 */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
514 fdct_col(block, block1, 4); /* stage 1, offset 4 */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
515
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
516 block1 = block_tmp; /* block1 was not modified above; this re-assignment keeps the same value */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
517 table = tab_frw_01234567;
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
518 out = block; /* stage 2 is handed the caller's buffer as `out` */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
519 for(i=8;i>0;i--) { /* exactly 8 iterations; i is only a counter */
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
520 fdct_row_mmx(block1, out, table);
72
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
521 block1 += 8; /* next 8 int16_t of the scratch buffer */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
522 table += 32; /* next 32 int16_t of the table */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
523 out += 8; /* next 8 int16_t of the caller's buffer */
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
524 }
3049d6d452a3 suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff changeset
525 }
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
526
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
/*
 * ff_fdct_mmx2 - MMX2 entry point of the forward DCT (see the file header).
 *
 * block: caller's int16_t coefficient buffer. It is passed as the first
 *        argument of both fdct_col() calls and, via `out`, as the second
 *        argument of every fdct_row_mmx2() call.
 *
 * The visible control flow is: two fdct_col() calls (offsets 0 and 4) given
 * `block` and a local scratch buffer, then 8 calls of fdct_row_mmx2(), each
 * advancing 8 int16_t in the scratch buffer and in `block`, and 32 int16_t
 * in tab_frw_01234567.
 *
 * NOTE(review): fdct_col(), fdct_row_mmx2(), ATTR_ALIGN and
 * tab_frw_01234567 are defined outside this excerpt; presumably the first
 * stage is the column pass and the second the row pass writing back into
 * `block` - confirm against their definitions.
 */
527 void ff_fdct_mmx2(int16_t *block)
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
528 {
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
529 int64_t align_tmp[16] ATTR_ALIGN(8); /* 16 * 8 bytes = room for 64 int16_t; ATTR_ALIGN(8) presumably requests 8-byte alignment - confirm */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
530 int16_t * const block_tmp= (int16_t*)align_tmp; /* int16_t view of the scratch buffer */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
531 int16_t *block1, *out;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
532 const int16_t *table;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
533 int i;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
534
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
535 block1 = block_tmp;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
536 fdct_col(block, block1, 0); /* first stage, offset 0 */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
537 fdct_col(block, block1, 4); /* first stage, offset 4 */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
538
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
539 block1 = block_tmp; /* block1 was not modified above; this re-assignment keeps the same value */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
540 table = tab_frw_01234567;
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
541 out = block; /* second stage is handed the caller's buffer as `out` */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
542 for(i=8;i>0;i--) { /* exactly 8 iterations; i is only a counter */
1570
9a9c14e87ebf optimizing
michael
parents: 1565
diff changeset
543 fdct_row_mmx2(block1, out, table);
1565
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
544 block1 += 8; /* next 8 int16_t of the scratch buffer */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
545 table += 32; /* next 32 int16_t of the table */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
546 out += 8; /* next 8 int16_t of the caller's buffer */
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
547 }
1a9a63f59849 minor mmx2 optimization if the dct
michael
parents: 1564
diff changeset
548 }
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
549
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2817
diff changeset
/*
 * ff_fdct_sse2 - SSE2 entry point of the forward DCT (see the file header).
 *
 * block: caller's int16_t coefficient buffer. It is passed as the first
 *        argument of both fdct_col() calls and as the second argument of
 *        fdct_row_sse2().
 *
 * The visible control flow is: two fdct_col() calls (offsets 0 and 4) given
 * `block` and a local scratch buffer, then a single fdct_row_sse2() call
 * given the scratch buffer and `block`. No per-row loop or table pointer is
 * used at this level.
 *
 * NOTE(review): fdct_col(), fdct_row_sse2() and ATTR_ALIGN are defined
 * outside this excerpt; presumably fdct_row_sse2() handles all rows itself
 * and writes the result back into `block` - confirm against its definition.
 * NOTE(review): the scratch buffer only asks for ATTR_ALIGN(8); verify that
 * fdct_row_sse2() does not need 16-byte alignment on its input.
 */
550 void ff_fdct_sse2(int16_t *block)
1765
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
551 {
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
552 int64_t align_tmp[16] ATTR_ALIGN(8); /* 16 * 8 bytes = room for 64 int16_t */
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
553 int16_t * const block_tmp= (int16_t*)align_tmp; /* int16_t view of the scratch buffer */
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
554 int16_t *block1;
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
555
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
556 block1 = block_tmp;
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
557 fdct_col(block, block1, 0); /* first stage, offset 0 */
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
558 fdct_col(block, block1, 4); /* first stage, offset 4 */
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
559
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
560 fdct_row_sse2(block1, block); /* second stage: one call, scratch buffer first, caller's buffer second */
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
561 }
e31754bc5b65 SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents: 1739
diff changeset
562