annotate mp3lib/decode_MMX.s @ 1446:a49fd85fc431

DATADIR/font/ patch by Adam Tla/lka atlka@pg.gda.pl
author arpi
date Mon, 06 Aug 2001 00:22:13 +0000
parents c73912315dbf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
1 # this code comes under GPL
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
2 # This code was taken from http://www.mpg123.org
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
3 # See the ChangeLog of mpg123-0.59s-pre.1 for details
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
4 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
5 #
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
6 # Local ChangeLog:
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
7 # - Partial loop unrolling and removal of MOVW instructions from the loops
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
8 #
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
9
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
.data
.align 8

# Word-select masks, one quadword each, used by synth_1to1_MMX_s when it
# merges freshly computed samples into the output buffer:
#   null_one keeps the low  16 bits of each 32-bit lane,
#   one_null keeps the high 16 bits of each 32-bit lane.
null_one:       .long 0x0000ffff, 0x0000ffff
one_null:       .long 0xffff0000, 0xffff0000

# costab_mmx: exported table of 31 32-bit words. Nothing in the visible part
# of this file reads it; the consumer lives elsewhere.
# NOTE(review): the values look like IEEE-754 single-precision bit patterns
# written as decimal integers (first entry = 0x3F002785, about 0.5006) --
# confirm against the code that reads the table.
.globl costab_mmx
costab_mmx:
        .long 1056974725
        .long 1057056395
        .long 1057223771
        .long 1057485416
        .long 1057855544
        .long 1058356026
        .long 1059019886
        .long 1059897405
        .long 1061067246
        .long 1062657950
        .long 1064892987
        .long 1066774581
        .long 1069414683
        .long 1073984175
        .long 1079645762
        .long 1092815430
        .long 1057005197
        .long 1057342072
        .long 1058087743
        .long 1059427869
        .long 1061799040
        .long 1065862217
        .long 1071413542
        .long 1084439708
        .long 1057128951
        .long 1058664893
        .long 1063675095
        .long 1076102863
        .long 1057655764
        .long 1067924853
        .long 1060439283
1259
9bf97b404134 Partial loops unrolling
nick
parents: 1245
diff changeset
47
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
diff changeset
.text

.globl synth_1to1_MMX_s
#-----------------------------------------------------------------------
# void synth_1to1_MMX_s(real *bandPtr, int channel, short *samples,
#                       short *buffs, int *bo);
#
# Produces 32 signed 16-bit samples and writes one into every other 16-bit
# slot of the output, starting at samples + channel (in 16-bit units). The
# slots in between are read back and stored unchanged.
#
# Conventions visible in the code:
#   - 32-bit x86, all arguments on the stack; with the four saved registers
#     they are found at 20(%esp) .. 36(%esp).
#   - ebp, edi, esi and ebx are saved on entry and restored before ret.
#     eax, ecx, edx and mm0-mm7 are overwritten; emms is executed before the
#     call and again before returning.
#   - *bo is updated to (*bo - 1) & 15 unless channel == 1.
#   - one_null, null_one, decwins and dct64_MMX_func are addressed
#     absolutely (no PIC). decwins and dct64_MMX_func are defined outside
#     the visible part of this file.
#
# Register roles after the call:
#   esi = read pointer into the buffer filled through dct64_MMX_func
#   edx = read pointer into decwins
#   edi = write pointer into the output
#   ecx = loop counter
#-----------------------------------------------------------------------
synth_1to1_MMX_s:
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        movl    24(%esp),%ecx           # ecx = channel
        movl    28(%esp),%edi           # edi = samples
        movl    $15,%ebx                # ebx = wrap mask for the *bo index
        movl    36(%esp),%edx           # edx = bo
        leal    (%edi,%ecx,2),%edi      # edi = samples + channel (16-bit units)
        decl    %ecx                    # ecx = channel - 1, zero only for channel 1
        movl    32(%esp),%esi           # esi = buffs
        movl    (%edx),%eax             # eax = *bo
        jecxz   .L1                     # channel == 1: keep *bo and the buffer base
        decl    %eax
        andl    %ebx,%eax               # eax = (*bo - 1) & 15
        leal    1088(%esi),%esi         # esi = buffs + 1088 bytes
        movl    %eax,(%edx)             # write the stepped index back to *bo
.L1:
        # From here on "bo" means the index value now held in eax.
        leal    (%esi,%eax,2),%edx      # edx = esi + 2*bo
        movl    %eax,%ebp               # ebp = bo
        incl    %eax
        pushl   20(%esp)                # third call argument: bandPtr
        andl    %ebx,%eax               # eax = (bo + 1) & 15
        leal    544(%esi,%eax,2),%ecx   # ecx = esi + 544 + 2*((bo + 1) & 15)
        incl    %ebx                    # ebx = 16
        testl   $1, %eax
        jnz     .L2                     # (bo + 1) odd: use the pointers as computed
        xchgl   %edx,%ecx               # (bo + 1) even: swap the two destinations,
        incl    %ebp                    #   take bo + 1 as the window offset
        leal    544(%esi),%esi          #   and read from 544 bytes further in
.L2:
        emms
        pushl   %edx                    # second call argument
        pushl   %ecx                    # first call argument
        call    *dct64_MMX_func
        # NOTE(review): nothing below releases the three argument words pushed
        # for this call. The epilogue only balances if the routine behind
        # dct64_MMX_func removes them itself (for example with ret $12) --
        # confirm against its definition before touching the stack here.
        leal    1(%ebx), %ecx           # ecx = 17
        subl    %ebp,%ebx               # ebx = 16 - ebp
        pushl   %ecx                    # saved for the odd-count test after .L3
        leal    decwins(%ebx,%ebx,1), %edx      # edx = decwins + 2*(16 - ebp)
        shrl    $1, %ecx                # ecx = 8 passes of two samples each

        # First half, two samples per pass. Each sample is a 16-term dot
        # product of 16-bit words: 32 bytes of window at (%edx) against
        # 32 bytes of buffer at (%esi). The second sample of a pass uses
        # window + 64 and buffer + 32. mm0-mm3 serve the first sample,
        # mm4-mm7 the second; the two streams are interleaved.
.align 16
.L3:
        movq    (%edx),%mm0
        movq    64(%edx),%mm4
        pmaddwd (%esi),%mm0
        pmaddwd 32(%esi),%mm4
        movq    8(%edx),%mm1
        movq    72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd 40(%esi),%mm5
        movq    16(%edx),%mm2
        movq    80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd 48(%esi),%mm6
        movq    24(%edx),%mm3
        movq    88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd 56(%esi),%mm7
        paddd   %mm1,%mm0
        paddd   %mm5,%mm4
        paddd   %mm2,%mm0
        paddd   %mm6,%mm4
        paddd   %mm3,%mm0
        paddd   %mm7,%mm4
        movq    %mm0,%mm1
        movq    %mm4,%mm5
        psrlq   $32,%mm1
        psrlq   $32,%mm5
        paddd   %mm1,%mm0               # low dword = complete 16-term sum
        paddd   %mm5,%mm4
        psrad   $13,%mm0                # arithmetic shift right by 13
        psrad   $13,%mm4
        packssdw %mm0,%mm0              # saturate to signed 16 bit
        packssdw %mm4,%mm4

        movq    (%edi), %mm1            # current contents of the 8 output bytes
        punpckldq %mm4, %mm0            # mm0 = first result | second result
        pand    one_null, %mm1          # keep the high word of each old dword
        pand    null_one, %mm0          # keep the low word of each new dword
        por     %mm0, %mm1
        movq    %mm1,(%edi)             # words 0 and 2 replaced, 1 and 3 kept

        leal    64(%esi),%esi           # two buffer rows consumed
        leal    128(%edx),%edx          # two window rows consumed
        leal    8(%edi),%edi            # two output slot pairs written

        decl    %ecx
        jnz     .L3

        popl    %ecx                    # the count pushed before the loop (17)
        andl    $1, %ecx
        jecxz   .next_loop              # skipped only for an even count

        # Odd remainder: one more sample, same scheme, single stream.
        movq    (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq    8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq    16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq    24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd   %mm1,%mm0
        paddd   %mm2,%mm0
        paddd   %mm3,%mm0
        movq    %mm0,%mm1
        psrlq   $32,%mm1
        paddd   %mm1,%mm0
        psrad   $13,%mm0
        packssdw %mm0,%mm0
        movd    %mm0,%eax
        movw    %ax, (%edi)             # store just the one 16-bit sample
        leal    32(%esi),%esi
        leal    64(%edx),%edx
        leal    4(%edi),%edi

.next_loop:
        subl    $64,%esi                # turn around: the buffer is now read backwards
        movl    $7,%ecx                 # seven passes of two samples each

        # Second half. The window pointer keeps advancing while the buffer
        # pointer steps downwards (second sample of a pass at buffer - 32),
        # and every result is negated before it is stored.
.align 16
.L4:
        movq    (%edx),%mm0
        movq    64(%edx),%mm4
        pmaddwd (%esi),%mm0
        pmaddwd -32(%esi),%mm4
        movq    8(%edx),%mm1
        movq    72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd -24(%esi),%mm5
        movq    16(%edx),%mm2
        movq    80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd -16(%esi),%mm6
        movq    24(%edx),%mm3
        movq    88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd -8(%esi),%mm7
        paddd   %mm1,%mm0
        paddd   %mm5,%mm4
        paddd   %mm2,%mm0
        paddd   %mm6,%mm4
        paddd   %mm3,%mm0
        paddd   %mm7,%mm4
        movq    %mm0,%mm1
        movq    %mm4,%mm5
        psrlq   $32,%mm1
        psrlq   $32,%mm5
        paddd   %mm0,%mm1               # here the sum is collected in mm1 / mm5
        paddd   %mm4,%mm5
        psrad   $13,%mm1
        psrad   $13,%mm5
        packssdw %mm1,%mm1
        packssdw %mm5,%mm5
        psubd   %mm0,%mm0               # mm0 = 0
        psubd   %mm4,%mm4               # mm4 = 0
        psubsw  %mm1,%mm0               # mm0 = saturating negation of the result
        psubsw  %mm5,%mm4

        movq    (%edi), %mm1
        punpckldq %mm4, %mm0
        pand    one_null, %mm1
        pand    null_one, %mm0
        por     %mm0, %mm1
        movq    %mm1,(%edi)             # same word merge as in the first half

        subl    $64,%esi
        addl    $128,%edx
        leal    8(%edi),%edi
        decl    %ecx
        jnz     .L4

        # Last sample of the second half, single stream, also negated.
        movq    (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq    8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq    16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq    24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd   %mm1,%mm0
        paddd   %mm2,%mm0
        paddd   %mm3,%mm0
        movq    %mm0,%mm1
        psrlq   $32,%mm1
        paddd   %mm0,%mm1
        psrad   $13,%mm1
        packssdw %mm1,%mm1
        psubd   %mm0,%mm0
        psubsw  %mm1,%mm0
        movd    %mm0,%eax
        movw    %ax,(%edi)

        emms                            # leave the FPU usable for the caller
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret