Mercurial > mplayer.hg
annotate mp3lib/dct64_3dnow.s @ 2316:bcb229557e9b
fixed alignment (static variables were sometimes not 8-byte aligned)
added half uv interpolation support
added prefetch
BGR15 support in MMX (untested) (so BGR15,16,24,32 are supported)
special unscaled height version (not much faster, but it doesn't interpolate uv vertically)
author | michael |
---|---|
date | Sat, 20 Oct 2001 21:12:09 +0000 |
parents | 175423b2691e |
children |
rev | line source |
---|---|
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
1 # This code was taken from http://www.mpg123.org |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
4 # Partial 3dnow! optimization by Nick Kurshev |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
5 # |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
6 # TODO: optimize scalar 3dnow! code |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
7 # Warning: Phases 7 & 8 are not tested |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
8 # |
1 | 9 |
1277 | 10 .data |
11 .align 8 | |
12 x_plus_minus_3dnow: .long 0x00000000, 0x80000000 | |
13 plus_1f: .float 1.0 | |
14 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
15 .text |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
17 .align 16 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
18 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
19 .globl dct64_MMX_3dnow |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
20 dct64_MMX_3dnow: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
21 pushl %ebx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
22 pushl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
23 pushl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
24 subl $256,%esp |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
25 movl 280(%esp),%eax |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
26 leal 128(%esp),%edx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
27 movl 272(%esp),%esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
28 movl 276(%esp),%edi |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
29 movl $costab_mmx,%ebx |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
30 orl %ecx,%ecx |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
31 movl %esp,%ecx |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
32 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
33 /* Phase 1*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
34 movq (%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
35 movq 8(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
36 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
37 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
38 movq 120(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
39 movq 112(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
40 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
41 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
42 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
43 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
44 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
45 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
46 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
47 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
48 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
49 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
50 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
51 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
52 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
53 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
54 pfmul (%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
55 pfmul 8(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
56 movd %mm3, 124(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
57 movd %mm7, 116(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
58 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
59 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
60 movd %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
61 movd %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
62 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
63 movq 16(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
64 movq 24(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
65 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
66 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
67 movq 104(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
68 movq 96(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
69 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
70 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
71 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
72 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
73 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
74 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
75 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
76 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
77 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
78 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
79 movq %mm0, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
80 movq %mm4, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
81 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
82 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
83 pfmul 16(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
84 pfmul 24(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
85 movd %mm3, 108(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
86 movd %mm7, 100(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
87 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
88 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
89 movd %mm3, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
90 movd %mm7, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
91 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
92 movq 32(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
93 movq 40(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
94 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
95 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
96 movq 88(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
97 movq 80(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
98 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
99 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
100 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
101 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
102 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
103 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
104 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
105 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
106 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
107 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
108 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
109 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
110 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
111 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
112 pfmul 32(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
113 pfmul 40(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
114 movd %mm3, 92(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
115 movd %mm7, 84(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
116 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
117 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
118 movd %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
119 movd %mm7, 80(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
120 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
121 movq 48(%eax), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
122 movq 56(%eax), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
123 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
124 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
125 movq 72(%eax), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
126 movq 64(%eax), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
127 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
128 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
129 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
130 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
131 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
132 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
133 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
134 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
135 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
136 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
137 movq %mm0, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
138 movq %mm4, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
139 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
140 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
141 pfmul 48(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
142 pfmul 56(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
143 movd %mm3, 76(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
144 movd %mm7, 68(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
145 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
146 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
147 movd %mm3, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
148 movd %mm7, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
149 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
150 /* Phase 2*/ |
1 | 151 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
152 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
153 movq 8(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
154 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
155 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
156 movq 56(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
157 movq 48(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
158 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
159 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
160 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
161 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
162 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
163 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
164 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
165 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
166 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
167 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
168 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
169 movq %mm4, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
170 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
171 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
172 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
173 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
174 movd %mm3, 60(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
175 movd %mm7, 52(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
176 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
177 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
178 movd %mm3, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
179 movd %mm7, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
180 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
181 movq 16(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
182 movq 24(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
183 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
184 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
185 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
186 movq 32(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
187 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
188 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
189 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
190 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
191 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
192 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
193 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
194 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
195 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
196 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
197 movq %mm0, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
198 movq %mm4, 24(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
199 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
200 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
201 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
202 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
203 movd %mm3, 44(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
204 movd %mm7, 36(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
205 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
206 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
207 movd %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
208 movd %mm7, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
209 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
210 /* Phase 3*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
211 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
212 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
213 movq 72(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
214 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
215 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
216 movq 120(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
217 movq 112(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
218 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
219 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
220 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
221 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
222 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
223 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
224 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
225 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
226 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
227 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
228 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
229 movq %mm4, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
230 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
231 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
232 pfmul 64(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
233 pfmul 72(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
234 movd %mm3, 124(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
235 movd %mm7, 116(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
236 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
237 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
238 movd %mm3, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
239 movd %mm7, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
240 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
241 movq 80(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
242 movq 88(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
243 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
244 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
245 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
246 movq 96(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
247 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
248 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
249 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
250 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
251 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
252 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
253 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
254 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
255 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
256 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
257 movq %mm0, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
258 movq %mm4, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
259 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
260 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
261 pfmul 80(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
262 pfmul 88(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
263 movd %mm3, 108(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
264 movd %mm7, 100(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
265 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
266 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
267 movd %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
268 movd %mm7, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
269 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
270 /* Phase 4*/ |
1 | 271 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
272 movq (%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
273 movq 8(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
274 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
275 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
276 movq 24(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
277 movq 16(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
278 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
279 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
280 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
281 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
282 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
283 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
284 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
285 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
286 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
287 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
288 movq %mm0, (%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
289 movq %mm4, 8(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
290 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
291 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
292 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
293 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
294 movd %mm3, 28(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
295 movd %mm7, 20(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
296 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
297 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
298 movd %mm3, 24(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
299 movd %mm7, 16(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
300 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
301 movq 32(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
302 movq 40(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
303 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
304 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
305 movq 56(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
306 movq 48(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
307 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
308 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
309 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
310 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
311 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
312 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
313 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
314 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
315 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
316 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
317 movq %mm0, 32(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
318 movq %mm4, 40(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
319 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
320 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
321 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
322 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
323 movd %mm3, 60(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
324 movd %mm7, 52(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
325 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
326 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
327 movd %mm3, 56(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
328 movd %mm7, 48(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
329 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
330 movq 64(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
331 movq 72(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
332 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
333 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
334 movq 88(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
335 movq 80(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
336 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
337 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
338 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
339 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
340 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
341 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
342 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
343 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
344 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
345 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
346 movq %mm0, 64(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
347 movq %mm4, 72(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
348 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
349 pfsub %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
350 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
351 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
352 movd %mm3, 92(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
353 movd %mm7, 84(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
354 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
355 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
356 movd %mm3, 88(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
357 movd %mm7, 80(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
358 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
359 movq 96(%ecx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
360 movq 104(%ecx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
361 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
362 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
363 movq 120(%ecx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
364 movq 112(%ecx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
365 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
366 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
367 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
368 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
369 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
370 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
371 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
372 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
373 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
374 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
375 movq %mm0, 96(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
376 movq %mm4, 104(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
377 pfsubr %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
378 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
379 pfmul 96(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
380 pfmul 104(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
381 movd %mm3, 124(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
382 movd %mm7, 116(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
383 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
384 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
385 movd %mm3, 120(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
386 movd %mm7, 112(%edx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
387 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
388 /* Phase 5 */ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
389 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
390 movq (%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
391 movq 16(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
392 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
393 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
394 movq 8(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
395 movq 24(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
396 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
397 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
398 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
399 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
400 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
401 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
402 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
403 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
404 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
405 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
406 movq %mm0, (%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
407 movq %mm4, 16(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
408 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
409 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
410 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
411 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
412 movd %mm3, 12(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
413 movd %mm7, 28(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
414 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
415 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
416 movd %mm3, 8(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
417 movd %mm7, 24(%ecx) |
1 | 418 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
419 movq 32(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
420 movq 48(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
421 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
422 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
423 movq 40(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
424 movq 56(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
425 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
426 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
427 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
428 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
429 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
430 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
431 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
432 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
433 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
434 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
435 movq %mm0, 32(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
436 movq %mm4, 48(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
437 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
438 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
439 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
440 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
441 movd %mm3, 44(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
442 movd %mm7, 60(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
443 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
444 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
445 movd %mm3, 40(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
446 movd %mm7, 56(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
447 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
448 movq 64(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
449 movq 80(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
450 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
451 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
452 movq 72(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
453 movq 88(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
454 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
455 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
456 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
457 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
458 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
459 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
460 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
461 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
462 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
463 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
464 movq %mm0, 64(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
465 movq %mm4, 80(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
466 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
467 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
468 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
469 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
470 movd %mm3, 76(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
471 movd %mm7, 92(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
472 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
473 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
474 movd %mm3, 72(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
475 movd %mm7, 88(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
476 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
477 movq 96(%edx), %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
478 movq 112(%edx), %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
479 movq %mm0, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
480 movq %mm4, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
481 movq 104(%edx), %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
482 movq 120(%edx), %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
483 /* n.b.: pswapd*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
484 movq %mm1, %mm2 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
485 movq %mm5, %mm6 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
486 psrlq $32, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
487 psrlq $32, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
488 punpckldq %mm2, %mm1 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
489 punpckldq %mm6, %mm5 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
490 /**/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
491 pfadd %mm1, %mm0 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
492 pfadd %mm5, %mm4 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
493 movq %mm0, 96(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
494 movq %mm4, 112(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
495 pfsub %mm1, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
496 pfsubr %mm5, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
497 pfmul 112(%ebx), %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
498 pfmul 112(%ebx), %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
499 movd %mm3, 108(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
500 movd %mm7, 124(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
501 psrlq $32, %mm3 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
502 psrlq $32, %mm7 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
503 movd %mm3, 104(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
504 movd %mm7, 120(%ecx) |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
505 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
506 /* Phase 6. This is the end of the easy road. */ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
507 /* The code below is written in scalar mode and should be optimized. */ |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
508 |
1277 | 509 movd plus_1f, %mm6 |
510 punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/ | |
511 movq x_plus_minus_3dnow, %mm7 /* mm7 = sign mask 0x00000000 | 0x80000000 (pxor keeps the low float, negates the high float) */ | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
512 |
1277 | 513 movq 32(%ecx), %mm0 |
514 movq 64(%ecx), %mm2 | |
515 movq %mm0, %mm1 | |
516 movq %mm2, %mm3 | |
517 pxor %mm7, %mm1 | |
518 pxor %mm7, %mm3 | |
519 pfacc %mm1, %mm0 | |
520 pfacc %mm3, %mm2 | |
521 pfmul %mm6, %mm0 | |
522 pfmul %mm6, %mm2 | |
523 movq %mm0, 32(%edx) | |
524 movq %mm2, 64(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
525 |
1277 | 526 movd 44(%ecx), %mm0 |
527 movd 40(%ecx), %mm2 | |
528 movd 120(%ebx), %mm3 | |
529 punpckldq 76(%ecx), %mm0 | |
530 punpckldq 72(%ecx), %mm2 | |
531 punpckldq %mm3, %mm3 | |
532 movq %mm0, %mm4 | |
533 movq %mm2, %mm5 | |
534 pfsub %mm2, %mm0 | |
535 pfmul %mm3, %mm0 | |
536 movq %mm0, %mm1 | |
537 pfadd %mm5, %mm0 | |
538 pfadd %mm4, %mm0 | |
539 movq %mm0, %mm2 | |
540 punpckldq %mm1, %mm0 | |
541 punpckhdq %mm1, %mm2 | |
542 movq %mm0, 40(%edx) | |
543 movq %mm2, 72(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
544 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
545 movd 48(%ecx), %mm3 |
1282 | 546 movd 60(%ecx), %mm2 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
547 pfsub 52(%ecx), %mm3 |
1282 | 548 pfsub 56(%ecx), %mm2 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
549 pfmul 120(%ebx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
550 pfmul 120(%ebx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
551 movq %mm2, %mm1 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
552 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
553 pfadd 56(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
554 pfadd 60(%ecx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
555 movq %mm1, %mm0 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
556 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
557 pfadd 48(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
558 pfadd 52(%ecx), %mm0 |
1277 | 559 pfadd %mm3, %mm1 |
560 punpckldq %mm2, %mm1 | |
561 pfadd %mm3, %mm2 | |
562 punpckldq %mm2, %mm0 | |
563 movq %mm1, 56(%edx) | |
564 movq %mm0, 48(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
565 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
566 /*---*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
567 |
1277 | 568 movd 92(%ecx), %mm1 |
569 pfsub 88(%ecx), %mm1 | |
570 pfmul 120(%ebx), %mm1 | |
571 movd %mm1, 92(%edx) | |
572 pfadd 92(%ecx), %mm1 | |
573 pfadd 88(%ecx), %mm1 | |
574 movq %mm1, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
575 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
576 pfadd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
577 pfadd 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
578 movd %mm0, 80(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
579 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
580 movd 80(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
581 pfsub 84(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
582 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
583 pfadd %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
584 pfadd 92(%edx), %mm0 |
1282 | 585 punpckldq %mm1, %mm0 |
586 movq %mm0, 84(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
587 |
1277 | 588 movq 96(%ecx), %mm0 |
589 movq %mm0, %mm1 | |
590 pxor %mm7, %mm1 | |
591 pfacc %mm1, %mm0 | |
592 pfmul %mm6, %mm0 | |
593 movq %mm0, 96(%edx) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
594 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
595 movd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
596 pfsub 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
597 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
598 movd %mm0, 108(%edx) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
599 pfadd 104(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
600 pfadd 108(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
601 movd %mm0, 104(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
602 |
1277 | 603 movd 124(%ecx), %mm1 |
604 pfsub 120(%ecx), %mm1 | |
605 pfmul 120(%ebx), %mm1 | |
606 movd %mm1, 124(%edx) | |
607 pfadd 120(%ecx), %mm1 | |
608 pfadd 124(%ecx), %mm1 | |
609 movq %mm1, %mm0 | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
610 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
611 pfadd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
612 pfadd 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
613 movd %mm0, 112(%edx) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
614 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
615 movd 112(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
616 pfsub 116(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
617 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
618 pfadd %mm0,%mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
619 pfadd 124(%edx), %mm0 |
1282 | 620 punpckldq %mm1, %mm0 |
621 movq %mm0, 116(%edx) | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
622 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
623 jnz .L01 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
624 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
625 /* Phase 7*/ |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
626 /* The code below is written in scalar mode and should be optimized. */ |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
627 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
628 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
629 pfadd 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
630 movd %mm0, 1024(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
631 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
632 movd (%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
633 pfsub 4(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
634 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
635 movd %mm0, (%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
636 movd %mm0, (%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
637 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
638 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
639 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
640 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
641 movd %mm0, 512(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
642 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
643 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
644 movd %mm0, 512(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
645 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
646 movd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
647 pfsub 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
648 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
649 movq %mm0, %mm3 |
1 | 650 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
651 movd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
652 pfsub 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
653 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
654 movd %mm0, 768(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
655 movq %mm0, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
656 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
657 pfadd 24(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
658 pfadd 28(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
659 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
660 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
661 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
662 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
663 movd %mm0, 768(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
664 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
665 movd %mm1, 256(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
666 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
667 movd %mm2, 256(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
668 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
669 /* Phase 8*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
670 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
671 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
672 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
673 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
674 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
675 movd %mm0, 896(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
676 movd %mm1, 640(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
677 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
678 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
679 movd %mm0, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
680 movd %mm1, 384(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
681 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
682 movd 40(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
683 pfadd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
684 movd %mm0, 384(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
685 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
686 movd 56(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
687 pfadd 36(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
688 movd %mm0, 128(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
689 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
690 movd 60(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
691 movd %mm0, 896(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
692 pfadd 44(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
693 movd %mm0, 640(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
694 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
695 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
696 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
697 movq 104(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
698 pfadd 112(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
699 pfadd 104(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
700 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
701 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
702 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
703 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
704 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
705 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
706 pfadd 72(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
707 movd %mm0, 960(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
708 movd %mm2, 704(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
709 movd %mm4, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
710 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
711 psrlq $32, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
712 psrlq $32, %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
713 movd %mm0, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
714 movd %mm2, 320(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
715 movd %mm4, 576(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
716 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
717 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
718 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
719 movd %mm1, 832(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
720 movd %mm3, 576(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
721 movd %mm5, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
722 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
723 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
724 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
725 movd %mm1, 192(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
726 movd %mm3, 448(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
727 movd %mm5, 704(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
728 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
729 movd 120(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
730 pfadd 100(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
731 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
732 pfadd 88(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
733 movd %mm0, 192(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
734 pfadd 68(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
735 movd %mm1, 64(%esi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
736 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
737 movd 124(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
738 movd %mm0, 960(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
739 pfadd 92(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
740 movd %mm0, 832(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
741 |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
742 jmp .L_bye |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
743 .L01: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
744 /* Phase 9*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
745 |
1277 | 746 movq (%ecx), %mm0 |
747 movq %mm0, %mm1 | |
748 pxor %mm7, %mm1 | |
749 pfacc %mm1, %mm0 | |
750 pfmul %mm6, %mm0 | |
751 pf2id %mm0, %mm0 | |
752 movd %mm0, %eax | |
753 movw %ax, 512(%esi) | |
754 psrlq $32, %mm0 | |
755 movd %mm0, %eax | |
756 movw %ax, (%esi) | |
1 | 757 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
758 movd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
759 pfsub 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
760 pfmul 120(%ebx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
761 pf2id %mm0, %mm7 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
762 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
763 movw %ax, 256(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
764 pfadd 12(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
765 pfadd 8(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
766 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
767 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
768 movw %ax, 256(%esi) |
1 | 769 |
1277 | 770 movd 16(%ecx), %mm3 |
771 pfsub 20(%ecx), %mm3 | |
772 pfmul 120(%ebx), %mm3 | |
773 movq %mm3, %mm2 | |
1 | 774 |
1277 | 775 movd 28(%ecx), %mm2 |
776 pfsub 24(%ecx), %mm2 | |
777 pfmul 120(%ebx), %mm2 | |
778 movq %mm2, %mm1 | |
779 | |
780 pf2id %mm2, %mm7 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
781 movd %mm7, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
782 movw %ax, 384(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
783 |
1277 | 784 pfadd 24(%ecx), %mm1 |
785 pfadd 28(%ecx), %mm1 | |
786 movq %mm1, %mm0 | |
787 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
788 pfadd 16(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
789 pfadd 20(%ecx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
790 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
791 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
792 movw %ax, 384(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
793 pfadd %mm3, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
794 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
795 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
796 movw %ax, 128(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
797 pfadd %mm3, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
798 pf2id %mm2, %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
799 movd %mm2, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
800 movw %ax, 128(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
801 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
802 /* Phase 10*/ |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
803 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
804 movq 32(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
805 movq 48(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
806 pfadd 48(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
807 pfadd 40(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
808 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
809 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
810 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
811 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
812 movw %ax, 448(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
813 movw %cx, 320(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
814 psrlq $32, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
815 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
816 movd %mm0, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
817 movd %mm1, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
818 movw %ax, 64(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
819 movw %cx, 192(%edi) |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
820 |
1277 | 821 movd 40(%edx), %mm3 |
822 movd 56(%edx), %mm4 | |
823 movd 60(%edx), %mm0 | |
824 movd 44(%edx), %mm2 | |
825 movd 120(%edx), %mm5 | |
826 punpckldq %mm4, %mm3 | |
827 punpckldq 124(%edx), %mm0 | |
828 pfadd 100(%edx), %mm5 | |
829 punpckldq 36(%edx), %mm4 | |
830 punpckldq 92(%edx), %mm2 | |
831 movq %mm5, %mm6 | |
832 pfadd %mm4, %mm3 | |
833 pf2id %mm0, %mm1 | |
834 pf2id %mm3, %mm3 | |
835 pfadd 88(%edx), %mm5 | |
836 movd %mm1, %eax | |
837 movd %mm3, %ecx | |
838 movw %ax, 448(%edi) | |
839 movw %cx, 192(%esi) | |
840 pf2id %mm5, %mm5 | |
841 psrlq $32, %mm1 | |
842 psrlq $32, %mm3 | |
843 movd %mm5, %ebx | |
844 movd %mm1, %eax | |
845 movd %mm3, %ecx | |
846 movw %bx, 96(%esi) | |
847 movw %ax, 480(%edi) | |
848 movw %cx, 64(%esi) | |
849 pfadd %mm2, %mm0 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
850 pf2id %mm0, %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
851 movd %mm0, %eax |
1277 | 852 pfadd 68(%edx), %mm6 |
853 movw %ax, 320(%edi) | |
854 psrlq $32, %mm0 | |
855 pf2id %mm6, %mm6 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
856 movd %mm0, %eax |
1277 | 857 movd %mm6, %ebx |
858 movw %ax, 416(%edi) | |
859 movw %bx, 32(%esi) | |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
860 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
861 movq 96(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
862 movq 112(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
863 movq 104(%edx), %mm4 |
1277 | 864 pfadd %mm2, %mm0 |
865 pfadd %mm4, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
866 pfadd 120(%edx), %mm4 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
867 movq %mm0, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
868 movq %mm2, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
869 movq %mm4, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
870 pfadd 64(%edx), %mm0 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
871 pfadd 80(%edx), %mm2 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
872 pfadd 72(%edx), %mm4 |
1277 | 873 pf2id %mm0, %mm0 |
874 pf2id %mm2, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
875 pf2id %mm4, %mm4 |
1277 | 876 movd %mm0, %eax |
877 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
878 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
879 movw %ax, 480(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
880 movw %cx, 352(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
881 movw %bx, 224(%esi) |
1277 | 882 psrlq $32, %mm0 |
883 psrlq $32, %mm2 | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
884 psrlq $32, %mm4 |
1277 | 885 movd %mm0, %eax |
886 movd %mm2, %ecx | |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
887 movd %mm4, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
888 movw %ax, 32(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
889 movw %cx, 160(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
890 movw %bx, 288(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
891 pfadd 80(%edx), %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
892 pfadd 72(%edx), %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
893 pfadd 88(%edx), %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
894 pf2id %mm1, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
895 pf2id %mm3, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
896 pf2id %mm5, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
897 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
898 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
899 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
900 movw %ax, 416(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
901 movw %cx, 288(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
902 movw %bx, 160(%esi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
903 psrlq $32, %mm1 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
904 psrlq $32, %mm3 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
905 psrlq $32, %mm5 |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
906 movd %mm1, %eax |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
907 movd %mm3, %ecx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
908 movd %mm5, %ebx |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
909 movw %ax, 96(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
910 movw %cx, 224(%edi) |
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
911 movw %bx, 352(%edi) |
1 | 912 |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
913 movsw |
1 | 914 |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
915 .L_bye: |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
916 addl $256,%esp |
1271
2864e32cd267
Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents:
1245
diff
changeset
|
917 femms |
1245
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
918 popl %edi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
919 popl %esi |
03b7e2955a20
Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents:
1
diff
changeset
|
920 popl %ebx |
1282 | 921 ret $12 |