Mercurial > mplayer.hg
annotate mp3lib/dct64_k7.s @ 1277:3a9699d9e7da
Slight optimization
author | nick |
---|---|
date | Thu, 05 Jul 2001 09:44:18 +0000 |
parents | 2864e32cd267 |
children | 175423b2691e |
rev | line source |
---|---|
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
1 # This code was taken from http://www.mpg123.org |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
2 # See ChangeLog of mpg123-0.59s-pre.1 for details |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
4 # Partial 3dnowex-DSP! optimization by Nick Kurshev |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
5 # |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
6 # TODO: optimize scalar 3dnow! code |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
7 # Warning: Phases 7 & 8 are not tested |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
8 # |
736 | 9 |
# Constant data for the code in this file, placed in the writable .data section.
1277 | 10 .data |
# Align the constants that follow (8 bytes on x86 ELF targets).
# NOTE(review): presumably so the two-dword constant can be used as one 64-bit MMX operand - confirm.
11 .align 8 | |
# Two 32-bit words: low dword 0x00000000, high dword 0x80000000 (only bit 31 set,
# which is the sign-bit position of an IEEE-754 single-precision float).
# NOTE(review): presumably a sign-flip mask for the upper float of an MMX register;
# its use is not visible in this part of the file - confirm.
12 x_plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
# Single-precision constant 1.0.
# NOTE(review): the consumer of this constant is not visible in this part of the file - confirm.
13 plus_1f: .float 1.0 | |
14 | |
1173 | 15 .text |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
17 .align 16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
18 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
19 .globl dct64_MMX_3dnowex |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
20 dct64_MMX_3dnowex: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
21 pushl %ebx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
22 pushl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
23 pushl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
24 subl $256,%esp |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
25 movl 280(%esp),%eax |
781 | 26 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
27 leal 128(%esp),%edx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
28 movl 272(%esp),%esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
29 movl 276(%esp),%edi |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
30 movl $costab_mmx,%ebx |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
31 orl %ecx,%ecx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
32 movl %esp,%ecx |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
33 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
34 /* Phase 1*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
35 movq (%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
36 movq 8(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
37 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
38 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
39 movq 120(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
40 movq 112(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
41 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
42 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
43 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
44 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
45 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
46 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
47 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
48 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
49 pfmul (%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
50 pfmul 8(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
51 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
52 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
53 movq %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
54 movq %mm7, 112(%edx) |
781 | 55 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
56 movq 16(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
57 movq 24(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
58 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
59 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
60 movq 104(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
61 movq 96(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
62 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
63 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
64 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
65 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
66 movq %mm0, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
67 movq %mm4, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
68 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
69 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
70 pfmul 16(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
71 pfmul 24(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
72 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
73 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
74 movq %mm3, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
75 movq %mm7, 96(%edx) |
781 | 76 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
77 movq 32(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
78 movq 40(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
79 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
80 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
81 movq 88(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
82 movq 80(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
83 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
84 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
85 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
86 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
87 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
88 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
89 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
90 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
91 pfmul 32(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
92 pfmul 40(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
93 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
94 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
95 movq %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
96 movq %mm7, 80(%edx) |
781 | 97 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
98 movq 48(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
99 movq 56(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
100 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
101 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
102 movq 72(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
103 movq 64(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
104 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
105 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
106 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
107 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
108 movq %mm0, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
109 movq %mm4, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
110 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
111 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
112 pfmul 48(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
113 pfmul 56(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
114 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
115 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
116 movq %mm3, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
117 movq %mm7, 64(%edx) |
781 | 118 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
119 /* Phase 2*/ |
781 | 120 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
121 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
122 movq 8(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
123 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
124 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
125 movq 56(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
126 movq 48(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
127 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
128 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
129 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
130 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
131 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
132 movq %mm4, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
133 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
134 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
135 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
136 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
137 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
138 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
139 movq %mm3, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
140 movq %mm7, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
141 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
142 movq 16(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
143 movq 24(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
144 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
145 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
146 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
147 movq 32(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
148 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
149 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
150 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
151 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
152 movq %mm0, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
153 movq %mm4, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
154 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
155 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
156 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
157 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
158 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
159 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
160 movq %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
161 movq %mm7, 32(%ecx) |
781 | 162 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
163 /* Phase 3*/ |
781 | 164 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
165 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
166 movq 72(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
167 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
168 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
169 movq 120(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
170 movq 112(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
171 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
172 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
173 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
174 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
175 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
176 movq %mm4, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
177 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
178 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
179 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
180 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
181 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
182 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
183 movq %mm3, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
184 movq %mm7, 112(%ecx) |
736 | 185 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
186 movq 80(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
187 movq 88(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
188 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
189 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
190 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
191 movq 96(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
192 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
193 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
194 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
195 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
196 movq %mm0, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
197 movq %mm4, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
198 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
199 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
200 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
201 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
202 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
203 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
204 movq %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
205 movq %mm7, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
206 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
207 /* Phase 4*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
208 |
1277 | 209 movq 96(%ebx), %mm2 |
210 movq 104(%ebx), %mm6 | |
211 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
212 movq (%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
213 movq 8(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
214 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
215 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
216 movq 24(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
217 movq 16(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
218 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
219 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
220 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
221 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
222 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
223 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
224 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
225 pfsub %mm5, %mm7 |
1277 | 226 pfmul %mm2, %mm3 |
227 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
228 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
229 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
230 movq %mm3, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
231 movq %mm7, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
232 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
233 movq 32(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
234 movq 40(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
235 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
236 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
237 movq 56(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
238 movq 48(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
239 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
240 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
241 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
242 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
243 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
244 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
245 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
246 pfsubr %mm5, %mm7 |
1277 | 247 pfmul %mm2, %mm3 |
248 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
249 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
250 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
251 movq %mm3, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
252 movq %mm7, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
253 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
254 movq 64(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
255 movq 72(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
256 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
257 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
258 movq 88(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
259 movq 80(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
260 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
261 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
262 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
263 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
264 movq %mm0, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
265 movq %mm4, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
266 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
267 pfsub %mm5, %mm7 |
1277 | 268 pfmul %mm2, %mm3 |
269 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
270 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
271 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
272 movq %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
273 movq %mm7, 80(%edx) |
736 | 274 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
275 movq 96(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
276 movq 104(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
277 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
278 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
279 movq 120(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
280 movq 112(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
281 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
282 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
283 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
284 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
285 movq %mm0, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
286 movq %mm4, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
287 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
288 pfsubr %mm5, %mm7 |
1277 | 289 pfmul %mm2, %mm3 |
290 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
291 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
292 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
293 movq %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
294 movq %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
295 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
296 /* Phase 5 */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
297 |
1277 | 298 movq 112(%ebx), %mm2 |
299 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
300 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
301 movq 16(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
302 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
303 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
304 movq 8(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
305 movq 24(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
306 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
307 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
308 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
309 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
310 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
311 movq %mm4, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
312 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
313 pfsubr %mm5, %mm7 |
1277 | 314 pfmul %mm2, %mm3 |
315 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
316 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
317 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
318 movq %mm3, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
319 movq %mm7, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
320 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
321 movq 32(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
322 movq 48(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
323 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
324 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
325 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
326 movq 56(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
327 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
328 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
329 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
330 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
331 movq %mm0, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
332 movq %mm4, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
333 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
334 pfsubr %mm5, %mm7 |
1277 | 335 pfmul %mm2, %mm3 |
336 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
337 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
338 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
339 movq %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
340 movq %mm7, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
341 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
342 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
343 movq 80(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
344 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
345 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
346 movq 72(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
347 movq 88(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
348 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
349 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
350 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
351 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
352 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
353 movq %mm4, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
354 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
355 pfsubr %mm5, %mm7 |
1277 | 356 pfmul %mm2, %mm3 |
357 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
358 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
359 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
360 movq %mm3, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
361 movq %mm7, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
362 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
363 movq 96(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
364 movq 112(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
365 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
366 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
367 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
368 movq 120(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
369 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
370 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
371 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
372 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
373 movq %mm0, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
374 movq %mm4, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
375 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
376 pfsubr %mm5, %mm7 |
1277 | 377 pfmul %mm2, %mm3 |
378 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
379 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
380 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
381 movq %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
382 movq %mm7, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
383 |
1277 | 384 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
385 /* Phase 6. This is the end of the easy road. */ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
386 /* The code below is written in scalar mode and should be optimized. */ |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
387 |
1277 | 388 movd plus_1f, %mm6 |
389 punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ | |
390 movq x_plus_minus_3dnow, %mm7 /* mm7 = 0x00000000 | 0x80000000: sign-bit mask; pxor with it negates the high float (acts like +1 | -1) */ | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
391 |
1277 | 392 movq 32(%ecx), %mm0 |
393 movq 64(%ecx), %mm2 | |
394 movq %mm0, %mm1 | |
395 movq %mm2, %mm3 | |
396 pxor %mm7, %mm1 | |
397 pxor %mm7, %mm3 | |
398 pfacc %mm1, %mm0 | |
399 pfacc %mm3, %mm2 | |
400 pfmul %mm6, %mm0 | |
401 pfmul %mm6, %mm2 | |
402 movq %mm0, 32(%edx) | |
403 movq %mm2, 64(%edx) | |
736 | 404 |
1277 | 405 movd 44(%ecx), %mm0 |
406 movd 40(%ecx), %mm2 | |
407 movd 120(%ebx), %mm3 | |
408 punpckldq 76(%ecx), %mm0 | |
409 punpckldq 72(%ecx), %mm2 | |
410 punpckldq %mm3, %mm3 | |
411 movq %mm0, %mm4 | |
412 movq %mm2, %mm5 | |
413 pfsub %mm2, %mm0 | |
414 pfmul %mm3, %mm0 | |
415 movq %mm0, %mm1 | |
416 pfadd %mm5, %mm0 | |
417 pfadd %mm4, %mm0 | |
418 movq %mm0, %mm2 | |
419 punpckldq %mm1, %mm0 | |
420 punpckhdq %mm1, %mm2 | |
421 movq %mm0, 40(%edx) | |
422 movq %mm2, 72(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
423 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
424 movd 48(%ecx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
425 pfsub 52(%ecx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
426 pfmul 120(%ebx), %mm3 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
427 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
428 movd 60(%ecx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
429 pfsub 56(%ecx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
430 pfmul 120(%ebx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
431 movq %mm2, %mm1 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
432 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
433 pfadd 56(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
434 pfadd 60(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
435 movq %mm1, %mm0 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
436 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
437 pfadd 48(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
438 pfadd 52(%ecx), %mm0 |
1277 | 439 pfadd %mm3, %mm1 |
440 punpckldq %mm2, %mm1 | |
441 pfadd %mm3, %mm2 | |
442 punpckldq %mm2, %mm0 | |
443 movq %mm1, 56(%edx) | |
444 movq %mm0, 48(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
445 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
446 /*---*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
447 |
1277 | 448 movd 92(%ecx), %mm1 |
449 pfsub 88(%ecx), %mm1 | |
450 pfmul 120(%ebx), %mm1 | |
451 movd %mm1, 92(%edx) | |
452 pfadd 92(%ecx), %mm1 | |
453 pfadd 88(%ecx), %mm1 | |
454 movq %mm1, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
455 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
456 pfadd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
457 pfadd 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
458 movd %mm0, 80(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
459 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
460 movd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
461 pfsub 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
462 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
463 pfadd %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
464 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
465 movd %mm0, 84(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
466 movd %mm1, 88(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
467 |
1277 | 468 movq 96(%ecx), %mm0 |
469 movq %mm0, %mm1 | |
470 pxor %mm7, %mm1 | |
471 pfacc %mm1, %mm0 | |
472 pfmul %mm6, %mm0 | |
473 movq %mm0, 96(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
474 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
475 movd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
476 pfsub 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
477 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
478 movd %mm0, 108(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
479 pfadd 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
480 pfadd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
481 movd %mm0, 104(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
482 |
1277 | 483 movd 124(%ecx), %mm1 |
484 pfsub 120(%ecx), %mm1 | |
485 pfmul 120(%ebx), %mm1 | |
486 movd %mm1, 124(%edx) | |
487 pfadd 120(%ecx), %mm1 | |
488 pfadd 124(%ecx), %mm1 | |
489 movq %mm1, %mm0 | |
736 | 490 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
491 pfadd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
492 pfadd 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
493 movd %mm0, 112(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
494 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
495 movd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
496 pfsub 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
497 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
498 pfadd %mm0,%mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
499 pfadd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
500 movd %mm0, 116(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
501 movd %mm1, 120(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
502 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
503 jnz .L01 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
504 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
505 /* Phase 7*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
506 /* Code below is coded in scalar mode. Should be optimized */ |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
507 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
508 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
509 pfadd 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
510 movd %mm0, 1024(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
511 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
512 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
513 pfsub 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
514 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
515 movd %mm0, (%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
516 movd %mm0, (%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
517 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
518 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
519 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
520 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
521 movd %mm0, 512(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
522 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
523 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
524 movd %mm0, 512(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
525 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
526 movd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
527 pfsub 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
528 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
529 movq %mm0, %mm3 |
781 | 530 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
531 movd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
532 pfsub 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
533 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
534 movd %mm0, 768(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
535 movq %mm0, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
536 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
537 pfadd 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
538 pfadd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
539 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
540 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
541 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
542 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
543 movd %mm0, 768(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
544 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
545 movd %mm1, 256(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
546 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
547 movd %mm2, 256(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
548 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
549 /* Phase 8*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
550 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
551 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
552 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
553 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
554 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
555 movd %mm0, 896(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
556 movd %mm1, 640(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
557 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
558 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
559 movd %mm0, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
560 movd %mm1, 384(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
561 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
562 movd 40(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
563 pfadd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
564 movd %mm0, 384(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
565 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
566 movd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
567 pfadd 36(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
568 movd %mm0, 128(%esi) |
781 | 569 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
570 movd 60(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
571 movd %mm0, 896(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
572 pfadd 44(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
573 movd %mm0, 640(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
574 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
575 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
576 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
577 movq 104(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
578 pfadd 112(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
579 pfadd 104(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
580 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
581 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
582 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
583 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
584 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
585 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
586 pfadd 72(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
587 movd %mm0, 960(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
588 movd %mm2, 704(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
589 movd %mm4, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
590 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
591 psrlq $32, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
592 psrlq $32, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
593 movd %mm0, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
594 movd %mm2, 320(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
595 movd %mm4, 576(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
596 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
597 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
598 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
599 movd %mm1, 832(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
600 movd %mm3, 576(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
601 movd %mm5, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
602 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
603 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
604 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
605 movd %mm1, 192(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
606 movd %mm3, 448(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
607 movd %mm5, 704(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
608 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
609 movd 120(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
610 pfadd 100(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
611 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
612 pfadd 88(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
613 movd %mm0, 192(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
614 pfadd 68(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
615 movd %mm1, 64(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
616 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
617 movd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
618 movd %mm0, 960(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
619 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
620 movd %mm0, 832(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
621 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
622 jmp .L_bye |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
623 .L01: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
624 /* Phase 9*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
625 |
1277 | 626 movq (%ecx), %mm0 |
627 movq %mm0, %mm1 | |
628 pxor %mm7, %mm1 | |
629 pfacc %mm1, %mm0 | |
630 pfmul %mm6, %mm0 | |
631 pf2id %mm0, %mm0 | |
632 movd %mm0, %eax | |
633 movw %ax, 512(%esi) | |
634 psrlq $32, %mm0 | |
635 movd %mm0, %eax | |
636 movw %ax, (%esi) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
637 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
638 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
639 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
640 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
641 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
642 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
643 movw %ax, 256(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
644 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
645 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
646 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
647 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
648 movw %ax, 256(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
649 |
1277 | 650 movd 16(%ecx), %mm3 |
651 pfsub 20(%ecx), %mm3 | |
652 pfmul 120(%ebx), %mm3 | |
653 movq %mm3, %mm2 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
654 |
1277 | 655 movd 28(%ecx), %mm2 |
656 pfsub 24(%ecx), %mm2 | |
657 pfmul 120(%ebx), %mm2 | |
658 movq %mm2, %mm1 | |
659 | |
660 pf2id %mm2, %mm7 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
661 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
662 movw %ax, 384(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
663 |
1277 | 664 pfadd 24(%ecx), %mm1 |
665 pfadd 28(%ecx), %mm1 | |
666 movq %mm1, %mm0 | |
667 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
668 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
669 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
670 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
671 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
672 movw %ax, 384(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
673 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
674 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
675 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
676 movw %ax, 128(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
677 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
678 pf2id %mm2, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
679 movd %mm2, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
680 movw %ax, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
681 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
682 /* Phase 10*/ |
736 | 683 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
684 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
685 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
686 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
687 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
688 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
689 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
690 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
691 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
692 movw %ax, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
693 movw %cx, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
694 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
695 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
696 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
697 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
698 movw %ax, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
699 movw %cx, 192(%edi) |
736 | 700 |
1277 | 701 movd 40(%edx), %mm3 |
702 movd 56(%edx), %mm4 | |
703 movd 60(%edx), %mm0 | |
704 movd 44(%edx), %mm2 | |
705 movd 120(%edx), %mm5 | |
706 punpckldq %mm4, %mm3 | |
707 punpckldq 124(%edx), %mm0 | |
708 pfadd 100(%edx), %mm5 | |
709 punpckldq 36(%edx), %mm4 | |
710 punpckldq 92(%edx), %mm2 | |
711 movq %mm5, %mm6 | |
712 pfadd %mm4, %mm3 | |
713 pf2id %mm0, %mm1 | |
714 pf2id %mm3, %mm3 | |
715 pfadd 88(%edx), %mm5 | |
716 movd %mm1, %eax | |
717 movd %mm3, %ecx | |
718 movw %ax, 448(%edi) | |
719 movw %cx, 192(%esi) | |
720 pf2id %mm5, %mm5 | |
721 psrlq $32, %mm1 | |
722 psrlq $32, %mm3 | |
723 movd %mm5, %ebx | |
724 movd %mm1, %eax | |
725 movd %mm3, %ecx | |
726 movw %bx, 96(%esi) | |
727 movw %ax, 480(%edi) | |
728 movw %cx, 64(%esi) | |
729 pfadd %mm2, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
730 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
731 movd %mm0, %eax |
1277 | 732 pfadd 68(%edx), %mm6 |
733 movw %ax, 320(%edi) | |
734 psrlq $32, %mm0 | |
735 pf2id %mm6, %mm6 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
736 movd %mm0, %eax |
1277 | 737 movd %mm6, %ebx |
738 movw %ax, 416(%edi) | |
739 movw %bx, 32(%esi) | |
736 | 740 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
741 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
742 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
743 movq 104(%edx), %mm4 |
1277 | 744 pfadd %mm2, %mm0 |
745 pfadd %mm4, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
746 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
747 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
748 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
749 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
750 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
751 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
752 pfadd 72(%edx), %mm4 |
1277 | 753 pf2id %mm0, %mm0 |
754 pf2id %mm2, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
755 pf2id %mm4, %mm4 |
1277 | 756 movd %mm0, %eax |
757 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
758 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
759 movw %ax, 480(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
760 movw %cx, 352(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
761 movw %bx, 224(%esi) |
1277 | 762 psrlq $32, %mm0 |
763 psrlq $32, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
764 psrlq $32, %mm4 |
1277 | 765 movd %mm0, %eax |
766 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
767 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
768 movw %ax, 32(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
769 movw %cx, 160(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
770 movw %bx, 288(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
771 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
772 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
773 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
774 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
775 pf2id %mm3, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
776 pf2id %mm5, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
777 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
778 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
779 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
780 movw %ax, 416(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
781 movw %cx, 288(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
782 movw %bx, 160(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
783 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
784 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
785 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
786 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
787 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
788 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
789 movw %ax, 96(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
790 movw %cx, 224(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
791 movw %bx, 352(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
792 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
793 movsw |
736 | 794 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
795 .L_bye: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
796 addl $256,%esp |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
797 femms |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
798 popl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
799 popl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
800 popl %ebx |
1277 | 801 ret $12 |