Mercurial > mplayer.hg
annotate mp3lib/dct64_3dnow.s @ 1271:2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
author | nick |
---|---|
date | Wed, 04 Jul 2001 09:47:56 +0000 |
parents | 03b7e2955a20 |
children | 3a9699d9e7da |
rev | line source |
---|---|
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
1 # This code was taken from http://www.mpg123.org |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
4 # Partial 3dnow! optimization by Nick Kurshev |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
5 # |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
6 # TODO: optimize scalar 3dnow! code |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
7 # Warning: Phases 7 & 8 are not tested |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
8 # |
1 | 9 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
10 .text |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
11 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
12 .align 16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
13 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
14 .globl dct64_MMX_3dnow |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
15 dct64_MMX_3dnow: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
16 pushl %ebx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
17 pushl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
18 pushl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
19 subl $256,%esp |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
20 movl 280(%esp),%eax |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
21 leal 128(%esp),%edx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
22 movl 272(%esp),%esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
23 movl 276(%esp),%edi |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
24 movl $costab_mmx,%ebx |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
25 orl %ecx,%ecx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
26 movl %esp,%ecx |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
27 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
28 /* Phase 1*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
29 movq (%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
30 movq 8(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
31 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
32 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
33 movq 120(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
34 movq 112(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
35 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
36 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
37 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
38 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
39 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
40 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
41 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
42 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
43 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
44 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
45 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
46 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
47 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
48 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
49 pfmul (%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
50 pfmul 8(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
51 movd %mm3, 124(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
52 movd %mm7, 116(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
53 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
54 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
55 movd %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
56 movd %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
57 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
58 movq 16(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
59 movq 24(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
60 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
61 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
62 movq 104(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
63 movq 96(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
64 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
65 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
66 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
67 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
68 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
69 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
70 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
71 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
72 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
73 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
74 movq %mm0, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
75 movq %mm4, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
76 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
77 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
78 pfmul 16(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
79 pfmul 24(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
80 movd %mm3, 108(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
81 movd %mm7, 100(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
82 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
83 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
84 movd %mm3, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
85 movd %mm7, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
86 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
87 movq 32(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
88 movq 40(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
89 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
90 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
91 movq 88(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
92 movq 80(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
93 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
94 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
95 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
96 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
97 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
98 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
99 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
100 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
101 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
102 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
103 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
104 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
105 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
106 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
107 pfmul 32(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
108 pfmul 40(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
109 movd %mm3, 92(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
110 movd %mm7, 84(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
111 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
112 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
113 movd %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
114 movd %mm7, 80(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
115 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
116 movq 48(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
117 movq 56(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
118 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
119 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
120 movq 72(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
121 movq 64(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
122 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
123 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
124 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
125 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
126 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
127 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
128 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
129 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
130 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
131 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
132 movq %mm0, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
133 movq %mm4, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
134 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
135 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
136 pfmul 48(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
137 pfmul 56(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
138 movd %mm3, 76(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
139 movd %mm7, 68(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
140 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
141 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
142 movd %mm3, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
143 movd %mm7, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
144 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
145 /* Phase 2*/ |
1 | 146 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
147 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
148 movq 8(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
149 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
150 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
151 movq 56(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
152 movq 48(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
153 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
154 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
155 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
156 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
157 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
158 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
159 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
160 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
161 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
162 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
163 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
164 movq %mm4, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
165 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
166 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
167 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
168 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
169 movd %mm3, 60(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
170 movd %mm7, 52(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
171 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
172 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
173 movd %mm3, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
174 movd %mm7, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
175 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
176 movq 16(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
177 movq 24(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
178 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
179 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
180 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
181 movq 32(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
182 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
183 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
184 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
185 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
186 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
187 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
188 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
189 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
190 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
191 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
192 movq %mm0, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
193 movq %mm4, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
194 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
195 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
196 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
197 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
198 movd %mm3, 44(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
199 movd %mm7, 36(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
200 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
201 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
202 movd %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
203 movd %mm7, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
204 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
205 /* Phase 3*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
206 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
207 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
208 movq 72(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
209 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
210 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
211 movq 120(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
212 movq 112(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
213 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
214 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
215 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
216 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
217 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
218 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
219 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
220 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
221 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
222 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
223 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
224 movq %mm4, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
225 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
226 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
227 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
228 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
229 movd %mm3, 124(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
230 movd %mm7, 116(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
231 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
232 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
233 movd %mm3, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
234 movd %mm7, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
235 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
236 movq 80(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
237 movq 88(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
238 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
239 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
240 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
241 movq 96(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
242 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
243 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
244 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
245 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
246 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
247 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
248 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
249 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
250 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
251 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
252 movq %mm0, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
253 movq %mm4, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
254 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
255 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
256 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
257 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
258 movd %mm3, 108(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
259 movd %mm7, 100(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
260 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
261 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
262 movd %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
263 movd %mm7, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
264 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
265 /* Phase 4*/ |
1 | 266 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
267 movq (%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
268 movq 8(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
269 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
270 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
271 movq 24(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
272 movq 16(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
273 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
274 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
275 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
276 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
277 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
278 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
279 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
280 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
281 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
282 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
283 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
284 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
285 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
286 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
287 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
288 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
289 movd %mm3, 28(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
290 movd %mm7, 20(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
291 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
292 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
293 movd %mm3, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
294 movd %mm7, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
295 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
296 movq 32(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
297 movq 40(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
298 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
299 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
300 movq 56(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
301 movq 48(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
302 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
303 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
304 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
305 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
306 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
307 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
308 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
309 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
310 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
311 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
312 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
313 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
314 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
315 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
316 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
317 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
318 movd %mm3, 60(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
319 movd %mm7, 52(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
320 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
321 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
322 movd %mm3, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
323 movd %mm7, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
324 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
325 movq 64(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
326 movq 72(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
327 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
328 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
329 movq 88(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
330 movq 80(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
331 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
332 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
333 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
334 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
335 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
336 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
337 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
338 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
339 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
340 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
341 movq %mm0, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
342 movq %mm4, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
343 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
344 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
345 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
346 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
347 movd %mm3, 92(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
348 movd %mm7, 84(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
349 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
350 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
351 movd %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
352 movd %mm7, 80(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
353 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
354 movq 96(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
355 movq 104(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
356 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
357 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
358 movq 120(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
359 movq 112(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
360 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
361 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
362 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
363 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
364 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
365 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
366 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
367 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
368 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
369 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
370 movq %mm0, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
371 movq %mm4, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
372 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
373 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
374 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
375 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
376 movd %mm3, 124(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
377 movd %mm7, 116(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
378 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
379 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
380 movd %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
381 movd %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
382 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
383 /* Phase 5 */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
384 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
385 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
386 movq 16(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
387 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
388 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
389 movq 8(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
390 movq 24(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
391 /* pswapd emulation: swap the high and low dwords of %mm1 and %mm5 (scratch: %mm2, %mm6) */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
392 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
393 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
394 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
395 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
396 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
397 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
398 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
399 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
400 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
401 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
402 movq %mm4, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
403 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
404 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
405 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
406 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
407 movd %mm3, 12(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
408 movd %mm7, 28(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
409 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
410 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
411 movd %mm3, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
412 movd %mm7, 24(%ecx) |
1 | 413 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
414 movq 32(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
415 movq 48(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
416 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
417 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
418 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
419 movq 56(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
420 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
421 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
422 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
423 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
424 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
425 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
426 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
427 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
428 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
429 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
430 movq %mm0, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
431 movq %mm4, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
432 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
433 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
434 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
435 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
436 movd %mm3, 44(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
437 movd %mm7, 60(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
438 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
439 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
440 movd %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
441 movd %mm7, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
442 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
443 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
444 movq 80(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
445 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
446 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
447 movq 72(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
448 movq 88(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
449 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
450 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
451 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
452 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
453 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
454 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
455 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
456 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
457 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
458 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
459 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
460 movq %mm4, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
461 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
462 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
463 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
464 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
465 movd %mm3, 76(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
466 movd %mm7, 92(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
467 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
468 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
469 movd %mm3, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
470 movd %mm7, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
471 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
472 movq 96(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
473 movq 112(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
474 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
475 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
476 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
477 movq 120(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
478 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
479 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
480 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
481 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
482 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
483 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
484 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
485 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
486 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
487 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
488 movq %mm0, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
489 movq %mm4, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
490 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
491 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
492 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
493 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
494 movd %mm3, 108(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
495 movd %mm7, 124(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
496 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
497 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
498 movd %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
499 movd %mm7, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
500 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
501 /* Phase 6. This is the end of the easy road. */ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
502 /* The code below is written in scalar mode and should be optimized. */ |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
503 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
504 movd 32(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
505 pfadd 36(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
506 movd %mm0, 32(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
507 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
508 movd 32(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
509 pfsub 36(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
510 pfmul 120(%ebx),%mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
511 movd %mm0, 36(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
512 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
513 movd 44(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
514 pfsub 40(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
515 pfmul 120(%ebx),%mm0 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
516 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
517 movd %mm0, 44(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
518 pfadd 40(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
519 pfadd 44(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
520 movd %mm0, 40(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
521 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
522 movd 48(%ecx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
523 pfsub 52(%ecx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
524 pfmul 120(%ebx), %mm3 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
525 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
526 movd 60(%ecx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
527 pfsub 56(%ecx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
528 pfmul 120(%ebx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
529 movq %mm2, %mm1 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
530 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
531 pfadd 56(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
532 pfadd 60(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
533 movq %mm1, %mm0 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
534 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
535 pfadd 48(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
536 pfadd 52(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
537 movd %mm0, 48(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
538 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
539 movd %mm1, 56(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
540 movd %mm2, 60(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
541 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
542 movd %mm2, 52(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
543 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
544 /*---*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
545 movd 64(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
546 pfadd 68(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
547 movd %mm0, 64(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
548 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
549 movd 64(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
550 pfsub 68(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
551 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
552 movd %mm0, 68(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
553 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
554 movd 76(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
555 pfsub 72(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
556 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
557 movd %mm0, 76(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
558 pfadd 72(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
559 pfadd 76(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
560 movd %mm0, 72(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
561 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
562 movd 92(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
563 pfsub 88(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
564 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
565 movd %mm0, 92(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
566 pfadd 92(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
567 pfadd 88(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
568 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
569 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
570 pfadd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
571 pfadd 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
572 movd %mm0, 80(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
573 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
574 movd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
575 pfsub 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
576 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
577 pfadd %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
578 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
579 movd %mm0, 84(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
580 movd %mm1, 88(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
581 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
582 movd 96(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
583 pfadd 100(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
584 movd %mm0, 96(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
585 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
586 movd 96(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
587 pfsub 100(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
588 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
589 movd %mm0, 100(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
590 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
591 movd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
592 pfsub 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
593 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
594 movd %mm0, 108(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
595 pfadd 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
596 pfadd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
597 movd %mm0, 104(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
598 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
599 movd 124(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
600 pfsub 120(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
601 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
602 movd %mm0, 124(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
603 pfadd 120(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
604 pfadd 124(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
605 movq %mm0, %mm1 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
606 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
607 pfadd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
608 pfadd 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
609 movd %mm0, 112(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
610 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
611 movd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
612 pfsub 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
613 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
614 pfadd %mm0,%mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
615 pfadd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
616 movd %mm0, 116(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
617 movd %mm1, 120(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
618 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
619 jnz .L01 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
620 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
621 /* Phase 7*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
622 /* The code below is written in scalar mode and should be optimized. */ |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
623 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
624 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
625 pfadd 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
626 movd %mm0, 1024(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
627 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
628 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
629 pfsub 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
630 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
631 movd %mm0, (%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
632 movd %mm0, (%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
633 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
634 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
635 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
636 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
637 movd %mm0, 512(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
638 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
639 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
640 movd %mm0, 512(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
641 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
642 movd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
643 pfsub 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
644 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
645 movq %mm0, %mm3 |
1 | 646 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
647 movd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
648 pfsub 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
649 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
650 movd %mm0, 768(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
651 movq %mm0, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
652 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
653 pfadd 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
654 pfadd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
655 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
656 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
657 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
658 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
659 movd %mm0, 768(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
660 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
661 movd %mm1, 256(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
662 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
663 movd %mm2, 256(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
664 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
665 /* Phase 8*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
666 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
667 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
668 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
669 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
670 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
671 movd %mm0, 896(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
672 movd %mm1, 640(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
673 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
674 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
675 movd %mm0, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
676 movd %mm1, 384(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
677 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
678 movd 40(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
679 pfadd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
680 movd %mm0, 384(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
681 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
682 movd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
683 pfadd 36(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
684 movd %mm0, 128(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
685 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
686 movd 60(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
687 movd %mm0, 896(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
688 pfadd 44(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
689 movd %mm0, 640(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
690 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
691 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
692 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
693 movq 104(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
694 pfadd 112(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
695 pfadd 104(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
696 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
697 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
698 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
699 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
700 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
701 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
702 pfadd 72(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
703 movd %mm0, 960(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
704 movd %mm2, 704(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
705 movd %mm4, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
706 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
707 psrlq $32, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
708 psrlq $32, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
709 movd %mm0, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
710 movd %mm2, 320(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
711 movd %mm4, 576(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
712 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
713 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
714 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
715 movd %mm1, 832(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
716 movd %mm3, 576(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
717 movd %mm5, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
718 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
719 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
720 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
721 movd %mm1, 192(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
722 movd %mm3, 448(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
723 movd %mm5, 704(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
724 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
725 movd 120(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
726 pfadd 100(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
727 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
728 pfadd 88(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
729 movd %mm0, 192(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
730 pfadd 68(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
731 movd %mm1, 64(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
732 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
733 movd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
734 movd %mm0, 960(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
735 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
736 movd %mm0, 832(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
737 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
738 jmp .L_bye |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
739 .L01: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
740 /* Phase 9*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
741 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
742 pfadd 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
743 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
744 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
745 movw %ax, 512(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
746 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
747 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
748 pfsub 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
749 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
750 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
751 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
752 movw %ax, (%esi) |
1 | 753 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
754 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
755 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
756 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
757 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
758 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
759 movw %ax, 256(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
760 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
761 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
762 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
763 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
764 movw %ax, 256(%esi) |
1 | 765 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
766 movd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
767 pfsub 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
768 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
769 movq %mm0, %mm3 |
1 | 770 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
771 movd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
772 pfsub 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
773 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
774 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
775 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
776 movw %ax, 384(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
777 movq %mm0, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
778 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
779 pfadd 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
780 pfadd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
781 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
782 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
783 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
784 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
785 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
786 movw %ax, 384(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
787 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
788 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
789 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
790 movw %ax, 128(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
791 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
792 pf2id %mm2, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
793 movd %mm2, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
794 movw %ax, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
795 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
796 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
797 /* Phase 10*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
798 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
799 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
800 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
801 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
802 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
803 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
804 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
805 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
806 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
807 movw %ax, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
808 movw %cx, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
809 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
810 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
811 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
812 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
813 movw %ax, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
814 movw %cx, 192(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
815 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
816 movd 40(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
817 pfadd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
818 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
819 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
820 movw %ax, 192(%esi) |
1 | 821 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
822 movd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
823 pfadd 36(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
824 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
825 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
826 movw %ax, 64(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
827 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
828 movd 60(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
829 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
830 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
831 movw %ax, 448(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
832 pfadd 44(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
833 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
834 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
835 movw %ax, 320(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
836 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
837 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
838 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
839 movq 104(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
840 pfadd 112(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
841 pfadd 104(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
842 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
843 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
844 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
845 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
846 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
847 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
848 pfadd 72(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
849 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
850 pf2id %mm2, %mm6 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
851 pf2id %mm4, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
852 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
853 movd %mm6, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
854 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
855 movw %ax, 480(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
856 movw %cx, 352(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
857 movw %bx, 224(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
858 psrlq $32, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
859 psrlq $32, %mm6 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
860 psrlq $32, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
861 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
862 movd %mm6, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
863 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
864 movw %ax, 32(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
865 movw %cx, 160(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
866 movw %bx, 288(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
867 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
868 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
869 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
870 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
871 pf2id %mm3, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
872 pf2id %mm5, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
873 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
874 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
875 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
876 movw %ax, 416(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
877 movw %cx, 288(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
878 movw %bx, 160(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
879 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
880 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
881 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
882 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
883 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
884 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
885 movw %ax, 96(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
886 movw %cx, 224(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
887 movw %bx, 352(%edi) |
1 | 888 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
889 movd 120(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
890 pfadd 100(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
891 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
892 pfadd 88(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
893 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
894 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
895 movw %ax, 96(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
896 pfadd 68(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
897 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
898 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
899 movw %ax, 32(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
900 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
901 movq 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
902 pf2id %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
903 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
904 movw %ax, 480(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
905 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
906 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
907 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
908 movw %ax, 416(%edi) |
1 | 909 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
910 movsw |
1 | 911 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
912 .L_bye: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
913 addl $256,%esp |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
914 femms |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
915 popl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
916 popl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
917 popl %ebx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
918 ret |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
919 |
1 | 920 |