annotate mp3lib/decode_MMX.s @ 1271:2864e32cd267

Finished 3dnow optimization (in scalar mode) and minor improvements
author nick
date Wed, 04 Jul 2001 09:47:56 +0000
parents 9bf97b404134
children 3a9699d9e7da
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
1 # this code comes under GPL
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
2 # This code was taken from http://www.mpg123.org
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
3 # See ChangeLog of mpg123-0.59s-pre.1 for detail
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
4 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
5 #
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
6 # Local ChangeLog:
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
7 # - Partial loops unrolling and removing MOVW insn from loops
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
8 #
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
9
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
.data
.align 8

# Two quadword masks, each built from a repeated 32-bit pattern.
null_one:
        .long   0x0000ffff, 0x0000ffff          # low 16 bits of each dword set
one_null:
        .long   0xffff0000, 0xffff0000          # high 16 bits of each dword set

# Exported table of thirty-one 32-bit constants; nothing in this section
# reads it, so its consumer lives elsewhere.
# NOTE(review): the values look like IEEE-754 single-precision bit patterns
# (the last entry, 1060439283, is 0x3F3504F3, roughly 0.7071). Presumably
# this is the cosine table used by the dct64 code, laid out as sub-tables
# of 16, 8, 4, 2 and 1 entries -- confirm against the code that uses it.
# The grouping below is layout only; the emitted bytes are in the same
# order as one value per line.
.globl costab_mmx
costab_mmx:
        .long   1056974725, 1057056395, 1057223771, 1057485416
        .long   1057855544, 1058356026, 1059019886, 1059897405
        .long   1061067246, 1062657950, 1064892987, 1066774581
        .long   1069414683, 1073984175, 1079645762, 1092815430

        .long   1057005197, 1057342072, 1058087743, 1059427869
        .long   1061799040, 1065862217, 1071413542, 1084439708

        .long   1057128951, 1058664893, 1063675095, 1076102863

        .long   1057655764, 1067924853

        .long   1060439283
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
47
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
.text

.globl synth_1to1_MMX_s

#-----------------------------------------------------------------------
# synth_1to1_MMX_s
#
# Target: 32-bit x86 with MMX, GNU as / AT&T syntax, arguments on the
# stack. Offsets below are relative to %esp after the four pushes.
#
#   arg1 at 20(%esp): passed through unchanged as the third argument of
#                     the routine called via dct64_MMX_func
#   arg2 at 24(%esp): integer; arg2 * 2 bytes is added to the output
#                     pointer, and the value 1 skips the index update
#   arg3 at 28(%esp): base of the 16-bit output area
#   arg4 at 32(%esp): base of the work buffer
#   arg5 at 36(%esp): pointer to a 32-bit index, kept masked to 0..15
#
# NOTE(review): presumably (bandPtr, channel, samples, buffs, bo) as in
# the mpg123 synth_1to1 routine -- confirm against the C prototype.
#
# Effect: calls *dct64_MMX_func, then produces 32 results. Each result
# is a 16-element signed 16-bit dot product of a decwins row with a
# buffer row, shifted right by 13 and saturated to 16 bits. The first 17
# are stored as computed, the last 15 are stored negated. Results land
# in every other 16-bit slot of the output (4-byte stride); the slots in
# between keep their previous contents.
#
# Registers: saves and restores ebp, edi, esi, ebx. Overwrites eax, ecx,
# edx, mm0-mm7 and flags. No dedicated return value is set up; eax is
# left holding the last packed result. The code relies on the called
# routine preserving ebx, ebp, esi and edi.
#-----------------------------------------------------------------------
synth_1to1_MMX_s:
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    24(%esp),%ecx           # ecx = arg2
        movl    28(%esp),%edi           # edi = arg3 (output base)
        movl    $15,%ebx                # ebx = index mask
        movl    36(%esp),%edx           # edx = arg5 (index pointer)
        leal    (%edi,%ecx,2),%edi      # output += arg2 * 2 bytes
        decl    %ecx                    # ecx == 0 exactly when arg2 == 1
        movl    32(%esp),%esi           # esi = arg4 (buffer base)
        movl    (%edx),%eax             # eax = current index
        jecxz   .Lindex_ready           # arg2 == 1: index and base unchanged

        decl    %eax
        andl    %ebx,%eax               # index = (index - 1) & 15
        leal    1088(%esi),%esi         # buffer base moves up by 1088 bytes
        movl    %eax,(%edx)             # write the new index back

.Lindex_ready:
        leal    (%esi,%eax,2),%edx      # edx = base + index * 2
        movl    %eax,%ebp               # ebp = index (decwins offset, below)
        incl    %eax
        pushl   20(%esp)                # call argument 3: arg1
        andl    %ebx,%eax               # eax = (index + 1) & 15
        leal    544(%esi,%eax,2),%ecx   # ecx = base + 544 + eax * 2
        incl    %ebx                    # ebx = 16
        testl   $1, %eax
        jnz     .Lcall_dct64            # odd: keep the pointers as computed

        xchgl   %edx,%ecx               # even: swap the two pointers,
        incl    %ebp                    #       use index + 1 for decwins,
        leal    544(%esi),%esi          #       and read rows from base + 544

.Lcall_dct64:
        emms                            # clear MMX state ahead of the call
        pushl   %edx                    # call argument 2
        pushl   %ecx                    # call argument 1
        call    *dct64_MMX_func         # indirect call through a pointer variable
        addl    $12,%esp                # drop the three pushed arguments

        leal    1(%ebx), %ecx           # ecx = 17 results in the forward pass
        subl    %ebp,%ebx               # ebx = 16 - ebp
        pushl   %ecx                    # keep the count for the parity test
        leal    decwins(%ebx,%ebx,1), %edx      # edx = decwins + 2 * ebx bytes
        shrl    $1, %ecx                # ecx = 8 two-result passes

# Forward pass, two results per iteration. mm0-mm3 handle the row pair at
# (%edx) / (%esi), mm4-mm7 the pair at 64(%edx) / 32(%esi); the two
# instruction streams are interleaved.
        .align 16
.Lfwd_pair:
        movq    (%edx),%mm0
        movq    64(%edx),%mm4
        pmaddwd (%esi),%mm0             # signed word products, adjacent pairs summed
        pmaddwd 32(%esi),%mm4
        movq    8(%edx),%mm1
        movq    72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd 40(%esi),%mm5
        movq    16(%edx),%mm2
        movq    80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd 48(%esi),%mm6
        movq    24(%edx),%mm3
        movq    88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd 56(%esi),%mm7
        paddd   %mm1,%mm0               # fold the four partial sums of each stream
        paddd   %mm5,%mm4
        paddd   %mm2,%mm0
        paddd   %mm6,%mm4
        paddd   %mm3,%mm0
        paddd   %mm7,%mm4
        movq    %mm0,%mm1
        movq    %mm4,%mm5
        psrlq   $32,%mm1                # bring the high dword down
        psrlq   $32,%mm5
        paddd   %mm1,%mm0               # low dword = full 16-term sum
        paddd   %mm5,%mm4
        psrad   $13,%mm0                # arithmetic shift right by 13
        psrad   $13,%mm4
        packssdw %mm0,%mm0              # saturate to signed 16 bits
        packssdw %mm4,%mm4

        movq    (%edi), %mm1            # current 8 bytes of output
        punpckldq %mm4, %mm0            # mm0 = first result dword, second result dword
        pand    one_null, %mm1          # keep the high word of each output dword
        pand    null_one, %mm0          # keep the low word of each result dword
        por     %mm0, %mm1
        movq    %mm1,(%edi)             # store both results, other words untouched

        leal    64(%esi),%esi           # two buffer rows forward
        leal    128(%edx),%edx          # two decwins rows forward
        leal    8(%edi),%edi            # two output slots forward

        decl    %ecx
        jnz     .Lfwd_pair

        popl    %ecx                    # saved forward count
        andl    $1, %ecx
        jecxz   .Lfwd_done              # even count: no leftover result

# Leftover single result of the forward pass.
        movq    (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq    8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq    16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq    24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd   %mm1,%mm0
        paddd   %mm2,%mm0
        paddd   %mm3,%mm0
        movq    %mm0,%mm1
        psrlq   $32,%mm1
        paddd   %mm1,%mm0
        psrad   $13,%mm0
        packssdw %mm0,%mm0
        movd    %mm0,%eax
        movw    %ax, (%edi)             # 16-bit store of the single result
        leal    32(%esi),%esi
        leal    64(%edx),%edx
        leal    4(%edi),%edi

.Lfwd_done:
        subl    $64,%esi                # step the buffer pointer back 64 bytes
        movl    $7,%ecx                 # seven two-result passes

# Reverse pass, two results per iteration. decwins still advances, the
# buffer pointer walks downwards ((%esi) then -32(%esi)), and each result
# is negated with saturation before it is stored.
        .align 16
.Lrev_pair:
        movq    (%edx),%mm0
        movq    64(%edx),%mm4
        pmaddwd (%esi),%mm0
        pmaddwd -32(%esi),%mm4
        movq    8(%edx),%mm1
        movq    72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd -24(%esi),%mm5
        movq    16(%edx),%mm2
        movq    80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd -16(%esi),%mm6
        movq    24(%edx),%mm3
        movq    88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd -8(%esi),%mm7
        paddd   %mm1,%mm0
        paddd   %mm5,%mm4
        paddd   %mm2,%mm0
        paddd   %mm6,%mm4
        paddd   %mm3,%mm0
        paddd   %mm7,%mm4
        movq    %mm0,%mm1
        movq    %mm4,%mm5
        psrlq   $32,%mm1
        psrlq   $32,%mm5
        paddd   %mm0,%mm1               # totals go to mm1 / mm5 this time
        paddd   %mm4,%mm5
        psrad   $13,%mm1
        psrad   $13,%mm5
        packssdw %mm1,%mm1
        packssdw %mm5,%mm5
        psubd   %mm0,%mm0               # mm0 = 0
        psubd   %mm4,%mm4               # mm4 = 0
        psubsw  %mm1,%mm0               # mm0 = 0 - result, signed saturation
        psubsw  %mm5,%mm4

        movq    (%edi), %mm1            # same masked merge as the forward pass
        punpckldq %mm4, %mm0
        pand    one_null, %mm1
        pand    null_one, %mm0
        por     %mm0, %mm1
        movq    %mm1,(%edi)

        subl    $64,%esi                # two buffer rows back
        addl    $128,%edx               # two decwins rows forward
        leal    8(%edi),%edi
        decl    %ecx
        jnz     .Lrev_pair

# Last single result of the reverse pass, negated like the others.
        movq    (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq    8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq    16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq    24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd   %mm1,%mm0
        paddd   %mm2,%mm0
        paddd   %mm3,%mm0
        movq    %mm0,%mm1
        psrlq   $32,%mm1
        paddd   %mm0,%mm1
        psrad   $13,%mm1
        packssdw %mm1,%mm1
        psubd   %mm0,%mm0
        psubsw  %mm1,%mm0
        movd    %mm0,%eax
        movw    %ax,(%edi)

        emms                            # leave MMX state before returning
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
251
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
252