Mercurial > mplayer.hg
annotate mp3lib/dct64_k7.s @ 2316:bcb229557e9b
fixed alignment (static variables were sometimes not 8-byte aligned)
added half uv interpolation support
added prefetch
BGR15 support in MMX (untested) (so BGR15,16,24,32 are supported)
special unscaled height version (not much faster but it doesn't interpolate uv vertically)
author | michael |
---|---|
date | Sat, 20 Oct 2001 21:12:09 +0000 |
parents | 175423b2691e |
children |
rev | line source |
---|---|
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
1 # This code was taken from http://www.mpg123.org |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
4 # Partial 3dnowex-DSP! optimization by Nick Kurshev |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
5 # |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
6 # TODO: optimize scalar 3dnow! code |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
7 # Warning: Phases 7 & 8 are not tested |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
8 # |
736 | 9 |
1277 | 10 .data |
11 .align 8 | |
12 x_plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
13 plus_1f: .float 1.0 | |
14 | |
1173 | 15 .text |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
17 .align 16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
18 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
19 .globl dct64_MMX_3dnowex |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
20 dct64_MMX_3dnowex: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
21 pushl %ebx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
22 pushl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
23 pushl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
24 subl $256,%esp |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
25 movl 280(%esp),%eax |
781 | 26 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
27 leal 128(%esp),%edx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
28 movl 272(%esp),%esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
29 movl 276(%esp),%edi |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
30 movl $costab_mmx,%ebx |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
31 orl %ecx,%ecx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
32 movl %esp,%ecx |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
33 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
34 /* Phase 1*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
35 movq (%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
36 movq 8(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
37 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
38 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
39 movq 120(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
40 movq 112(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
41 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
42 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
43 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
44 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
45 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
46 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
47 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
48 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
49 pfmul (%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
50 pfmul 8(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
51 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
52 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
53 movq %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
54 movq %mm7, 112(%edx) |
781 | 55 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
56 movq 16(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
57 movq 24(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
58 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
59 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
60 movq 104(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
61 movq 96(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
62 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
63 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
64 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
65 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
66 movq %mm0, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
67 movq %mm4, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
68 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
69 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
70 pfmul 16(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
71 pfmul 24(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
72 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
73 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
74 movq %mm3, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
75 movq %mm7, 96(%edx) |
781 | 76 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
77 movq 32(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
78 movq 40(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
79 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
80 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
81 movq 88(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
82 movq 80(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
83 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
84 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
85 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
86 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
87 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
88 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
89 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
90 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
91 pfmul 32(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
92 pfmul 40(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
93 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
94 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
95 movq %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
96 movq %mm7, 80(%edx) |
781 | 97 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
98 movq 48(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
99 movq 56(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
100 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
101 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
102 movq 72(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
103 movq 64(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
104 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
105 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
106 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
107 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
108 movq %mm0, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
109 movq %mm4, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
110 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
111 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
112 pfmul 48(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
113 pfmul 56(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
114 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
115 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
116 movq %mm3, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
117 movq %mm7, 64(%edx) |
781 | 118 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
119 /* Phase 2*/ |
781 | 120 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
121 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
122 movq 8(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
123 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
124 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
125 movq 56(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
126 movq 48(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
127 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
128 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
129 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
130 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
131 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
132 movq %mm4, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
133 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
134 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
135 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
136 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
137 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
138 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
139 movq %mm3, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
140 movq %mm7, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
141 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
142 movq 16(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
143 movq 24(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
144 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
145 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
146 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
147 movq 32(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
148 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
149 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
150 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
151 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
152 movq %mm0, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
153 movq %mm4, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
154 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
155 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
156 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
157 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
158 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
159 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
160 movq %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
161 movq %mm7, 32(%ecx) |
781 | 162 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
163 /* Phase 3*/ |
781 | 164 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
165 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
166 movq 72(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
167 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
168 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
169 movq 120(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
170 movq 112(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
171 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
172 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
173 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
174 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
175 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
176 movq %mm4, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
177 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
178 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
179 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
180 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
181 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
182 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
183 movq %mm3, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
184 movq %mm7, 112(%ecx) |
736 | 185 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
186 movq 80(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
187 movq 88(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
188 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
189 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
190 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
191 movq 96(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
192 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
193 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
194 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
195 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
196 movq %mm0, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
197 movq %mm4, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
198 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
199 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
200 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
201 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
202 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
203 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
204 movq %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
205 movq %mm7, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
206 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
207 /* Phase 4*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
208 |
1277 | 209 movq 96(%ebx), %mm2 |
210 movq 104(%ebx), %mm6 | |
211 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
212 movq (%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
213 movq 8(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
214 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
215 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
216 movq 24(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
217 movq 16(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
218 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
219 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
220 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
221 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
222 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
223 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
224 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
225 pfsub %mm5, %mm7 |
1277 | 226 pfmul %mm2, %mm3 |
227 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
228 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
229 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
230 movq %mm3, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
231 movq %mm7, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
232 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
233 movq 32(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
234 movq 40(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
235 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
236 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
237 movq 56(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
238 movq 48(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
239 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
240 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
241 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
242 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
243 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
244 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
245 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
246 pfsubr %mm5, %mm7 |
1277 | 247 pfmul %mm2, %mm3 |
248 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
249 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
250 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
251 movq %mm3, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
252 movq %mm7, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
253 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
254 movq 64(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
255 movq 72(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
256 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
257 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
258 movq 88(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
259 movq 80(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
260 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
261 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
262 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
263 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
264 movq %mm0, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
265 movq %mm4, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
266 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
267 pfsub %mm5, %mm7 |
1277 | 268 pfmul %mm2, %mm3 |
269 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
270 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
271 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
272 movq %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
273 movq %mm7, 80(%edx) |
736 | 274 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
275 movq 96(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
276 movq 104(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
277 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
278 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
279 movq 120(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
280 movq 112(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
281 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
282 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
283 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
284 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
285 movq %mm0, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
286 movq %mm4, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
287 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
288 pfsubr %mm5, %mm7 |
1277 | 289 pfmul %mm2, %mm3 |
290 pfmul %mm6, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
291 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
292 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
293 movq %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
294 movq %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
295 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
296 /* Phase 5 */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
297 |
1277 | 298 movq 112(%ebx), %mm2 |
299 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
300 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
301 movq 16(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
302 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
303 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
304 movq 8(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
305 movq 24(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
306 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
307 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
308 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
309 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
310 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
311 movq %mm4, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
312 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
313 pfsubr %mm5, %mm7 |
1277 | 314 pfmul %mm2, %mm3 |
315 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
316 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
317 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
318 movq %mm3, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
319 movq %mm7, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
320 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
321 movq 32(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
322 movq 48(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
323 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
324 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
325 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
326 movq 56(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
327 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
328 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
329 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
330 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
331 movq %mm0, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
332 movq %mm4, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
333 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
334 pfsubr %mm5, %mm7 |
1277 | 335 pfmul %mm2, %mm3 |
336 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
337 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
338 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
339 movq %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
340 movq %mm7, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
341 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
342 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
343 movq 80(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
344 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
345 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
346 movq 72(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
347 movq 88(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
348 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
349 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
350 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
351 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
352 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
353 movq %mm4, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
354 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
355 pfsubr %mm5, %mm7 |
1277 | 356 pfmul %mm2, %mm3 |
357 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
358 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
359 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
360 movq %mm3, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
361 movq %mm7, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
362 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
363 movq 96(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
364 movq 112(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
365 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
366 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
367 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
368 movq 120(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
369 pswapd %mm1, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
370 pswapd %mm5, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
371 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
372 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
373 movq %mm0, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
374 movq %mm4, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
375 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
376 pfsubr %mm5, %mm7 |
1277 | 377 pfmul %mm2, %mm3 |
378 pfmul %mm2, %mm7 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
379 pswapd %mm3, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
380 pswapd %mm7, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
381 movq %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
382 movq %mm7, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
383 |
1277 | 384 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
385 /* Phase 6. This is the end of the easy road. */ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
386 /* The code below is written in scalar mode and should be optimized. */ |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
387 |
1277 | 388 movd plus_1f, %mm6 |
389 punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ | |
390 movq x_plus_minus_3dnow, %mm7 /* mm7 = sign mask (low dword 0x00000000, high dword 0x80000000); pxor with it negates the high float */ | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
391 |
1277 | 392 movq 32(%ecx), %mm0 |
393 movq 64(%ecx), %mm2 | |
394 movq %mm0, %mm1 | |
395 movq %mm2, %mm3 | |
396 pxor %mm7, %mm1 | |
397 pxor %mm7, %mm3 | |
398 pfacc %mm1, %mm0 | |
399 pfacc %mm3, %mm2 | |
400 pfmul %mm6, %mm0 | |
401 pfmul %mm6, %mm2 | |
402 movq %mm0, 32(%edx) | |
403 movq %mm2, 64(%edx) | |
736 | 404 |
1277 | 405 movd 44(%ecx), %mm0 |
406 movd 40(%ecx), %mm2 | |
407 movd 120(%ebx), %mm3 | |
408 punpckldq 76(%ecx), %mm0 | |
409 punpckldq 72(%ecx), %mm2 | |
410 punpckldq %mm3, %mm3 | |
411 movq %mm0, %mm4 | |
412 movq %mm2, %mm5 | |
413 pfsub %mm2, %mm0 | |
414 pfmul %mm3, %mm0 | |
415 movq %mm0, %mm1 | |
416 pfadd %mm5, %mm0 | |
417 pfadd %mm4, %mm0 | |
418 movq %mm0, %mm2 | |
419 punpckldq %mm1, %mm0 | |
420 punpckhdq %mm1, %mm2 | |
421 movq %mm0, 40(%edx) | |
422 movq %mm2, 72(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
423 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
424 movd 48(%ecx), %mm3 |
1282 | 425 movd 60(%ecx), %mm2 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
426 pfsub 52(%ecx), %mm3 |
1282 | 427 pfsub 56(%ecx), %mm2 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
428 pfmul 120(%ebx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
429 pfmul 120(%ebx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
430 movq %mm2, %mm1 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
431 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
432 pfadd 56(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
433 pfadd 60(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
434 movq %mm1, %mm0 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
435 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
436 pfadd 48(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
437 pfadd 52(%ecx), %mm0 |
1277 | 438 pfadd %mm3, %mm1 |
439 punpckldq %mm2, %mm1 | |
440 pfadd %mm3, %mm2 | |
441 punpckldq %mm2, %mm0 | |
442 movq %mm1, 56(%edx) | |
443 movq %mm0, 48(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
444 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
445 /*---*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
446 |
1277 | 447 movd 92(%ecx), %mm1 |
448 pfsub 88(%ecx), %mm1 | |
449 pfmul 120(%ebx), %mm1 | |
450 movd %mm1, 92(%edx) | |
451 pfadd 92(%ecx), %mm1 | |
452 pfadd 88(%ecx), %mm1 | |
453 movq %mm1, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
454 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
455 pfadd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
456 pfadd 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
457 movd %mm0, 80(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
458 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
459 movd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
460 pfsub 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
461 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
462 pfadd %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
463 pfadd 92(%edx), %mm0 |
1282 | 464 punpckldq %mm1, %mm0 |
465 movq %mm0, 84(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
466 |
1277 | 467 movq 96(%ecx), %mm0 |
468 movq %mm0, %mm1 | |
469 pxor %mm7, %mm1 | |
470 pfacc %mm1, %mm0 | |
471 pfmul %mm6, %mm0 | |
472 movq %mm0, 96(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
473 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
474 movd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
475 pfsub 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
476 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
477 movd %mm0, 108(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
478 pfadd 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
479 pfadd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
480 movd %mm0, 104(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
481 |
1277 | 482 movd 124(%ecx), %mm1 |
483 pfsub 120(%ecx), %mm1 | |
484 pfmul 120(%ebx), %mm1 | |
485 movd %mm1, 124(%edx) | |
486 pfadd 120(%ecx), %mm1 | |
487 pfadd 124(%ecx), %mm1 | |
488 movq %mm1, %mm0 | |
736 | 489 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
490 pfadd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
491 pfadd 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
492 movd %mm0, 112(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
493 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
494 movd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
495 pfsub 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
496 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
497 pfadd %mm0,%mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
498 pfadd 124(%edx), %mm0 |
1282 | 499 punpckldq %mm1, %mm0 |
500 movq %mm0, 116(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
501 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
502 jnz .L01 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
503 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
504 /* Phase 7*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
505 /* Code below is coded in scalar mode. Should be optimized */ |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
506 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
507 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
508 pfadd 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
509 movd %mm0, 1024(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
510 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
511 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
512 pfsub 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
513 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
514 movd %mm0, (%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
515 movd %mm0, (%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
516 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
517 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
518 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
519 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
520 movd %mm0, 512(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
521 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
522 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
523 movd %mm0, 512(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
524 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
525 movd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
526 pfsub 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
527 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
528 movq %mm0, %mm3 |
781 | 529 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
530 movd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
531 pfsub 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
532 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
533 movd %mm0, 768(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
534 movq %mm0, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
535 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
536 pfadd 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
537 pfadd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
538 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
539 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
540 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
541 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
542 movd %mm0, 768(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
543 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
544 movd %mm1, 256(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
545 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
546 movd %mm2, 256(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
547 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
548 /* Phase 8*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
549 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
550 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
551 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
552 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
553 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
554 movd %mm0, 896(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
555 movd %mm1, 640(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
556 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
557 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
558 movd %mm0, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
559 movd %mm1, 384(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
560 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
561 movd 40(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
562 pfadd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
563 movd %mm0, 384(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
564 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
565 movd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
566 pfadd 36(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
567 movd %mm0, 128(%esi) |
781 | 568 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
569 movd 60(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
570 movd %mm0, 896(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
571 pfadd 44(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
572 movd %mm0, 640(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
573 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
574 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
575 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
576 movq 104(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
577 pfadd 112(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
578 pfadd 104(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
579 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
580 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
581 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
582 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
583 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
584 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
585 pfadd 72(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
586 movd %mm0, 960(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
587 movd %mm2, 704(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
588 movd %mm4, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
589 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
590 psrlq $32, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
591 psrlq $32, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
592 movd %mm0, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
593 movd %mm2, 320(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
594 movd %mm4, 576(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
595 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
596 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
597 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
598 movd %mm1, 832(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
599 movd %mm3, 576(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
600 movd %mm5, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
601 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
602 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
603 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
604 movd %mm1, 192(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
605 movd %mm3, 448(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
606 movd %mm5, 704(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
607 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
608 movd 120(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
609 pfadd 100(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
610 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
611 pfadd 88(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
612 movd %mm0, 192(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
613 pfadd 68(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
614 movd %mm1, 64(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
615 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
616 movd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
617 movd %mm0, 960(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
618 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
619 movd %mm0, 832(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
620 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
621 jmp .L_bye |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
622 .L01: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
623 /* Phase 9*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
624 |
1277 | 625 movq (%ecx), %mm0 |
626 movq %mm0, %mm1 | |
627 pxor %mm7, %mm1 | |
628 pfacc %mm1, %mm0 | |
629 pfmul %mm6, %mm0 | |
630 pf2id %mm0, %mm0 | |
631 movd %mm0, %eax | |
632 movw %ax, 512(%esi) | |
633 psrlq $32, %mm0 | |
634 movd %mm0, %eax | |
635 movw %ax, (%esi) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
636 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
637 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
638 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
639 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
640 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
641 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
642 movw %ax, 256(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
643 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
644 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
645 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
646 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
647 movw %ax, 256(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
648 |
1277 | 649 movd 16(%ecx), %mm3 |
650 pfsub 20(%ecx), %mm3 | |
651 pfmul 120(%ebx), %mm3 | |
652 movq %mm3, %mm2 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
653 |
1277 | 654 movd 28(%ecx), %mm2 |
655 pfsub 24(%ecx), %mm2 | |
656 pfmul 120(%ebx), %mm2 | |
657 movq %mm2, %mm1 | |
658 | |
659 pf2id %mm2, %mm7 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
660 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
661 movw %ax, 384(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
662 |
1277 | 663 pfadd 24(%ecx), %mm1 |
664 pfadd 28(%ecx), %mm1 | |
665 movq %mm1, %mm0 | |
666 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
667 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
668 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
669 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
670 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
671 movw %ax, 384(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
672 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
673 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
674 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
675 movw %ax, 128(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
676 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
677 pf2id %mm2, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
678 movd %mm2, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
679 movw %ax, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
680 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
681 /* Phase 10*/ |
736 | 682 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
683 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
684 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
685 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
686 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
687 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
688 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
689 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
690 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
691 movw %ax, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
692 movw %cx, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
693 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
694 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
695 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
696 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
697 movw %ax, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
698 movw %cx, 192(%edi) |
736 | 699 |
1277 | 700 movd 40(%edx), %mm3 |
701 movd 56(%edx), %mm4 | |
702 movd 60(%edx), %mm0 | |
703 movd 44(%edx), %mm2 | |
704 movd 120(%edx), %mm5 | |
705 punpckldq %mm4, %mm3 | |
706 punpckldq 124(%edx), %mm0 | |
707 pfadd 100(%edx), %mm5 | |
708 punpckldq 36(%edx), %mm4 | |
709 punpckldq 92(%edx), %mm2 | |
710 movq %mm5, %mm6 | |
711 pfadd %mm4, %mm3 | |
712 pf2id %mm0, %mm1 | |
713 pf2id %mm3, %mm3 | |
714 pfadd 88(%edx), %mm5 | |
715 movd %mm1, %eax | |
716 movd %mm3, %ecx | |
717 movw %ax, 448(%edi) | |
718 movw %cx, 192(%esi) | |
719 pf2id %mm5, %mm5 | |
720 psrlq $32, %mm1 | |
721 psrlq $32, %mm3 | |
722 movd %mm5, %ebx | |
723 movd %mm1, %eax | |
724 movd %mm3, %ecx | |
725 movw %bx, 96(%esi) | |
726 movw %ax, 480(%edi) | |
727 movw %cx, 64(%esi) | |
728 pfadd %mm2, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
729 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
730 movd %mm0, %eax |
1277 | 731 pfadd 68(%edx), %mm6 |
732 movw %ax, 320(%edi) | |
733 psrlq $32, %mm0 | |
734 pf2id %mm6, %mm6 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
735 movd %mm0, %eax |
1277 | 736 movd %mm6, %ebx |
737 movw %ax, 416(%edi) | |
738 movw %bx, 32(%esi) | |
736 | 739 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
740 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
741 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
742 movq 104(%edx), %mm4 |
1277 | 743 pfadd %mm2, %mm0 |
744 pfadd %mm4, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
745 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
746 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
747 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
748 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
749 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
750 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
751 pfadd 72(%edx), %mm4 |
1277 | 752 pf2id %mm0, %mm0 |
753 pf2id %mm2, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
754 pf2id %mm4, %mm4 |
1277 | 755 movd %mm0, %eax |
756 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
757 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
758 movw %ax, 480(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
759 movw %cx, 352(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
760 movw %bx, 224(%esi) |
1277 | 761 psrlq $32, %mm0 |
762 psrlq $32, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
763 psrlq $32, %mm4 |
1277 | 764 movd %mm0, %eax |
765 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
766 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
767 movw %ax, 32(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
768 movw %cx, 160(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
769 movw %bx, 288(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
770 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
771 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
772 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
773 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
774 pf2id %mm3, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
775 pf2id %mm5, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
776 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
777 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
778 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
779 movw %ax, 416(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
780 movw %cx, 288(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
781 movw %bx, 160(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
782 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
783 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
784 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
785 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
786 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
787 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
788 movw %ax, 96(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
789 movw %cx, 224(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
790 movw %bx, 352(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
791 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
792 movsw |
736 | 793 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
794 .L_bye: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
795 addl $256,%esp |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
796 femms |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
797 popl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
798 popl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1173
diff
changeset
|
799 popl %ebx |
1282 | 800 ret $12 |