annotate mp3lib/dct64_3dnow.s @ 2316:bcb229557e9b

Fixed alignment (static variables were sometimes not 8-byte aligned); added half-UV interpolation support; added prefetch; BGR15 support in MMX (untested), so BGR15, 16, 24 and 32 are supported; special unscaled-height version (not much faster, but it doesn't interpolate UV vertically).
author michael
date Sat, 20 Oct 2001 21:12:09 +0000
parents 175423b2691e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
1 # This code was taken from http://www.mpg123.org
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
2 # See ChangeLog of mpg123-0.59s-pre.1 for detail
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
3 # Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
4 # Partial 3dnow! optimization by Nick Kurshev
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
5 #
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
6 # TODO: optimize scalar 3dnow! code
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
7 # Warning: Phases 7 & 8 are not tested
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
8 #
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
9
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
10 .data /* initialized constants defined by this file */
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
11 .align 8 /* place the following data on an 8-byte boundary */
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
12 x_plus_minus_3dnow: .long 0x00000000, 0x80000000 /* two 32-bit words: first 0, second 0x80000000 (the IEEE-754 single-precision sign bit); presumably a sign-flip mask for the upper float of an MMX register - its use is not visible in this chunk, confirm */
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
13 plus_1f: .float 1.0 /* single-precision constant 1.0 */
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
14
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
15 .text
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
16
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
17 .align 16
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
18
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
19 .globl dct64_MMX_3dnow
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
20 dct64_MMX_3dnow:
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
21 pushl %ebx
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
22 pushl %esi
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
23 pushl %edi
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
24 subl $256,%esp
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
25 movl 280(%esp),%eax
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
26 leal 128(%esp),%edx
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
27 movl 272(%esp),%esi
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
28 movl 276(%esp),%edi
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
29 movl $costab_mmx,%ebx
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
30 orl %ecx,%ecx
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
31 movl %esp,%ecx
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
32
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
33 /* Phase 1*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
34 movq (%eax), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
35 movq 8(%eax), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
36 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
37 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
38 movq 120(%eax), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
39 movq 112(%eax), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
40 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
41 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
42 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
43 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
44 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
45 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
46 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
47 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
48 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
49 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
50 movq %mm0, (%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
51 movq %mm4, 8(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
52 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
53 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
54 pfmul (%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
55 pfmul 8(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
56 movd %mm3, 124(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
57 movd %mm7, 116(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
58 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
59 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
60 movd %mm3, 120(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
61 movd %mm7, 112(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
62
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
63 movq 16(%eax), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
64 movq 24(%eax), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
65 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
66 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
67 movq 104(%eax), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
68 movq 96(%eax), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
69 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
70 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
71 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
72 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
73 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
74 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
75 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
76 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
77 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
78 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
79 movq %mm0, 16(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
80 movq %mm4, 24(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
81 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
82 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
83 pfmul 16(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
84 pfmul 24(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
85 movd %mm3, 108(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
86 movd %mm7, 100(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
87 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
88 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
89 movd %mm3, 104(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
90 movd %mm7, 96(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
91
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
92 movq 32(%eax), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
93 movq 40(%eax), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
94 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
95 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
96 movq 88(%eax), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
97 movq 80(%eax), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
98 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
99 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
100 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
101 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
102 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
103 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
104 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
105 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
106 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
107 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
108 movq %mm0, 32(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
109 movq %mm4, 40(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
110 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
111 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
112 pfmul 32(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
113 pfmul 40(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
114 movd %mm3, 92(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
115 movd %mm7, 84(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
116 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
117 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
118 movd %mm3, 88(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
119 movd %mm7, 80(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
120
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
121 movq 48(%eax), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
122 movq 56(%eax), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
123 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
124 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
125 movq 72(%eax), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
126 movq 64(%eax), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
127 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
128 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
129 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
130 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
131 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
132 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
133 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
134 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
135 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
136 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
137 movq %mm0, 48(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
138 movq %mm4, 56(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
139 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
140 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
141 pfmul 48(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
142 pfmul 56(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
143 movd %mm3, 76(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
144 movd %mm7, 68(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
145 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
146 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
147 movd %mm3, 72(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
148 movd %mm7, 64(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
149
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
150 /* Phase 2*/
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
151
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
152 movq (%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
153 movq 8(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
154 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
155 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
156 movq 56(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
157 movq 48(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
158 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
159 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
160 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
161 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
162 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
163 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
164 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
165 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
166 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
167 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
168 movq %mm0, (%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
169 movq %mm4, 8(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
170 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
171 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
172 pfmul 64(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
173 pfmul 72(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
174 movd %mm3, 60(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
175 movd %mm7, 52(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
176 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
177 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
178 movd %mm3, 56(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
179 movd %mm7, 48(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
180
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
181 movq 16(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
182 movq 24(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
183 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
184 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
185 movq 40(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
186 movq 32(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
187 /* n.b.: pswapd*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
188 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
189 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
190 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
191 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
192 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
193 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
194 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
195 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
196 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
197 movq %mm0, 16(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
198 movq %mm4, 24(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
199 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
200 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
201 pfmul 80(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
202 pfmul 88(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
203 movd %mm3, 44(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
204 movd %mm7, 36(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
205 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
206 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
207 movd %mm3, 40(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
208 movd %mm7, 32(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
209
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
210 /* Phase 3*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
211
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
212 movq 64(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
213 movq 72(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
214 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
215 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
216 movq 120(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
217 movq 112(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
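218 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */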
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
219 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
220 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
221 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
222 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
223 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
224 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
225 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
226 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
227 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
228 movq %mm0, 64(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
229 movq %mm4, 72(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
230 pfsubr %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
231 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
232 pfmul 64(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
233 pfmul 72(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
234 movd %mm3, 124(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
235 movd %mm7, 116(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
236 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
237 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
238 movd %mm3, 120(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
239 movd %mm7, 112(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
240
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
241 movq 80(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
242 movq 88(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
243 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
244 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
245 movq 104(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
246 movq 96(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
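247 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */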
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
248 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
249 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
250 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
251 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
252 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
253 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
254 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
255 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
256 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
257 movq %mm0, 80(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
258 movq %mm4, 88(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
259 pfsubr %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
260 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
261 pfmul 80(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
262 pfmul 88(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
263 movd %mm3, 108(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
264 movd %mm7, 100(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
265 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
266 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
267 movd %mm3, 104(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
268 movd %mm7, 96(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
269
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
270 /* Phase 4*/
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
271
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
272 movq (%ecx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
273 movq 8(%ecx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
274 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
275 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
276 movq 24(%ecx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
277 movq 16(%ecx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
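278 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */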
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
279 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
280 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
281 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
282 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
283 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
284 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
285 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
286 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
287 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
288 movq %mm0, (%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
289 movq %mm4, 8(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
290 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
291 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
292 pfmul 96(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
293 pfmul 104(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
294 movd %mm3, 28(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
295 movd %mm7, 20(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
296 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
297 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
298 movd %mm3, 24(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
299 movd %mm7, 16(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
300
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
301 movq 32(%ecx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
302 movq 40(%ecx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
303 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
304 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
305 movq 56(%ecx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
306 movq 48(%ecx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
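307 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */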
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
308 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
309 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
310 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
311 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
312 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
313 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
314 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
315 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
316 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
317 movq %mm0, 32(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
318 movq %mm4, 40(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
319 pfsubr %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
320 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
321 pfmul 96(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
322 pfmul 104(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
323 movd %mm3, 60(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
324 movd %mm7, 52(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
325 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
326 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
327 movd %mm3, 56(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
328 movd %mm7, 48(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
329
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
330 movq 64(%ecx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
331 movq 72(%ecx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
332 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
333 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
334 movq 88(%ecx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
335 movq 80(%ecx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
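336 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */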
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
337 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
338 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
339 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
340 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
341 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
342 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
343 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
344 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
345 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
346 movq %mm0, 64(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
347 movq %mm4, 72(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
348 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
349 pfsub %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
350 pfmul 96(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
351 pfmul 104(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
352 movd %mm3, 92(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
353 movd %mm7, 84(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
354 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
355 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
356 movd %mm3, 88(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
357 movd %mm7, 80(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
358
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
359 movq 96(%ecx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
360 movq 104(%ecx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
361 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
362 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
363 movq 120(%ecx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
364 movq 112(%ecx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
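365 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */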
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
366 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
367 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
368 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
369 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
370 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
371 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
372 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
373 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
374 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
375 movq %mm0, 96(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
376 movq %mm4, 104(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
377 pfsubr %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
378 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
379 pfmul 96(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
380 pfmul 104(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
381 movd %mm3, 124(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
382 movd %mm7, 116(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
383 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
384 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
385 movd %mm3, 120(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
386 movd %mm7, 112(%edx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
387
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
388 /* Phase 5 */
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
389
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
390 movq (%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
391 movq 16(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
392 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
393 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
394 movq 8(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
395 movq 24(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
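396 /* n.b.: same effect as pswapd: the next six instructions swap the two 32-bit halves of %mm1 and of %mm5 (scratch: %mm2, %mm6) */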
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
397 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
398 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
399 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
400 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
401 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
402 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
403 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
404 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
405 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
406 movq %mm0, (%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
407 movq %mm4, 16(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
408 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
409 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
410 pfmul 112(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
411 pfmul 112(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
412 movd %mm3, 12(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
413 movd %mm7, 28(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
414 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
415 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
416 movd %mm3, 8(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
417 movd %mm7, 24(%ecx)
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
418
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
419 movq 32(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
420 movq 48(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
421 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
422 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
423 movq 40(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
424 movq 56(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
425 /* n.b.: pswapd emulation - the next six instructions swap the low and high dwords of mm1 and mm5 */
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
426 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
427 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
428 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
429 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
430 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
431 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
432 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
433 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
434 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
435 movq %mm0, 32(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
436 movq %mm4, 48(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
437 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
438 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
439 pfmul 112(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
440 pfmul 112(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
441 movd %mm3, 44(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
442 movd %mm7, 60(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
443 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
444 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
445 movd %mm3, 40(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
446 movd %mm7, 56(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
447
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
448 movq 64(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
449 movq 80(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
450 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
451 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
452 movq 72(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
453 movq 88(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
454 /* n.b.: pswapd emulation - the next six instructions swap the low and high dwords of mm1 and mm5 */
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
455 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
456 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
457 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
458 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
459 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
460 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
461 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
462 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
463 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
464 movq %mm0, 64(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
465 movq %mm4, 80(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
466 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
467 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
468 pfmul 112(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
469 pfmul 112(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
470 movd %mm3, 76(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
471 movd %mm7, 92(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
472 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
473 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
474 movd %mm3, 72(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
475 movd %mm7, 88(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
476
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
477 movq 96(%edx), %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
478 movq 112(%edx), %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
479 movq %mm0, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
480 movq %mm4, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
481 movq 104(%edx), %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
482 movq 120(%edx), %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
483 /* n.b.: pswapd emulation - the next six instructions swap the low and high dwords of mm1 and mm5 */
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
484 movq %mm1, %mm2
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
485 movq %mm5, %mm6
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
486 psrlq $32, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
487 psrlq $32, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
488 punpckldq %mm2, %mm1
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
489 punpckldq %mm6, %mm5
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
490 /**/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
491 pfadd %mm1, %mm0
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
492 pfadd %mm5, %mm4
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
493 movq %mm0, 96(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
494 movq %mm4, 112(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
495 pfsub %mm1, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
496 pfsubr %mm5, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
497 pfmul 112(%ebx), %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
498 pfmul 112(%ebx), %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
499 movd %mm3, 108(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
500 movd %mm7, 124(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
501 psrlq $32, %mm3
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
502 psrlq $32, %mm7
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
503 movd %mm3, 104(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
504 movd %mm7, 120(%ecx)
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
505
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
506 /* Phase 6. This is the end of the easy road. */
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
507 /* The code below is written in scalar mode and should be optimized. */
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
508
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
509 movd plus_1f, %mm6
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
510 punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
511 movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
512
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
513 movq 32(%ecx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
514 movq 64(%ecx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
515 movq %mm0, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
516 movq %mm2, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
517 pxor %mm7, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
518 pxor %mm7, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
519 pfacc %mm1, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
520 pfacc %mm3, %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
521 pfmul %mm6, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
522 pfmul %mm6, %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
523 movq %mm0, 32(%edx)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
524 movq %mm2, 64(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
525
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
526 movd 44(%ecx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
527 movd 40(%ecx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
528 movd 120(%ebx), %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
529 punpckldq 76(%ecx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
530 punpckldq 72(%ecx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
531 punpckldq %mm3, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
532 movq %mm0, %mm4
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
533 movq %mm2, %mm5
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
534 pfsub %mm2, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
535 pfmul %mm3, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
536 movq %mm0, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
537 pfadd %mm5, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
538 pfadd %mm4, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
539 movq %mm0, %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
540 punpckldq %mm1, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
541 punpckhdq %mm1, %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
542 movq %mm0, 40(%edx)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
543 movq %mm2, 72(%edx)
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
544
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
545 movd 48(%ecx), %mm3
1282
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
546 movd 60(%ecx), %mm2
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
547 pfsub 52(%ecx), %mm3
1282
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
548 pfsub 56(%ecx), %mm2
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
549 pfmul 120(%ebx), %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
550 pfmul 120(%ebx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
551 movq %mm2, %mm1
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
552
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
553 pfadd 56(%ecx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
554 pfadd 60(%ecx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
555 movq %mm1, %mm0
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
556
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
557 pfadd 48(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
558 pfadd 52(%ecx), %mm0
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
559 pfadd %mm3, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
560 punpckldq %mm2, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
561 pfadd %mm3, %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
562 punpckldq %mm2, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
563 movq %mm1, 56(%edx)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
564 movq %mm0, 48(%edx)
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
565
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
566 /*---*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
567
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
568 movd 92(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
569 pfsub 88(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
570 pfmul 120(%ebx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
571 movd %mm1, 92(%edx)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
572 pfadd 92(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
573 pfadd 88(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
574 movq %mm1, %mm0
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
575
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
576 pfadd 80(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
577 pfadd 84(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
578 movd %mm0, 80(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
579
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
580 movd 80(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
581 pfsub 84(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
582 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
583 pfadd %mm0, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
584 pfadd 92(%edx), %mm0
1282
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
585 punpckldq %mm1, %mm0
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
586 movq %mm0, 84(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
587
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
588 movq 96(%ecx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
589 movq %mm0, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
590 pxor %mm7, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
591 pfacc %mm1, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
592 pfmul %mm6, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
593 movq %mm0, 96(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
594
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
595 movd 108(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
596 pfsub 104(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
597 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
598 movd %mm0, 108(%edx)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
599 pfadd 104(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
600 pfadd 108(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
601 movd %mm0, 104(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
602
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
603 movd 124(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
604 pfsub 120(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
605 pfmul 120(%ebx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
606 movd %mm1, 124(%edx)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
607 pfadd 120(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
608 pfadd 124(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
609 movq %mm1, %mm0
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
610
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
611 pfadd 112(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
612 pfadd 116(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
613 movd %mm0, 112(%edx)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
614
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
615 movd 112(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
616 pfsub 116(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
617 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
618 pfadd %mm0,%mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
619 pfadd 124(%edx), %mm0
1282
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
620 punpckldq %mm1, %mm0
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
621 movq %mm0, 116(%edx)
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
622
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
623 jnz .L01
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
624
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
625 /* Phase 7*/
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
626 /* The code below is written in scalar mode and should be optimized. */
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
627
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
628 movd (%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
629 pfadd 4(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
630 movd %mm0, 1024(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
631
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
632 movd (%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
633 pfsub 4(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
634 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
635 movd %mm0, (%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
636 movd %mm0, (%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
637
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
638 movd 12(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
639 pfsub 8(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
640 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
641 movd %mm0, 512(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
642 pfadd 12(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
643 pfadd 8(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
644 movd %mm0, 512(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
645
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
646 movd 16(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
647 pfsub 20(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
648 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
649 movq %mm0, %mm3
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
650
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
651 movd 28(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
652 pfsub 24(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
653 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
654 movd %mm0, 768(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
655 movq %mm0, %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
656
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
657 pfadd 24(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
658 pfadd 28(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
659 movq %mm0, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
660
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
661 pfadd 16(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
662 pfadd 20(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
663 movd %mm0, 768(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
664 pfadd %mm3, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
665 movd %mm1, 256(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
666 pfadd %mm3, %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
667 movd %mm2, 256(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
668
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
669 /* Phase 8*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
670
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
671 movq 32(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
672 movq 48(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
673 pfadd 48(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
674 pfadd 40(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
675 movd %mm0, 896(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
676 movd %mm1, 640(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
677 psrlq $32, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
678 psrlq $32, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
679 movd %mm0, 128(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
680 movd %mm1, 384(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
681
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
682 movd 40(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
683 pfadd 56(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
684 movd %mm0, 384(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
685
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
686 movd 56(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
687 pfadd 36(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
688 movd %mm0, 128(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
689
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
690 movd 60(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
691 movd %mm0, 896(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
692 pfadd 44(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
693 movd %mm0, 640(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
694
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
695 movq 96(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
696 movq 112(%edx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
697 movq 104(%edx), %mm4
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
698 pfadd 112(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
699 pfadd 104(%edx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
700 pfadd 120(%edx), %mm4
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
701 movq %mm0, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
702 movq %mm2, %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
703 movq %mm4, %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
704 pfadd 64(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
705 pfadd 80(%edx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
706 pfadd 72(%edx), %mm4
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
707 movd %mm0, 960(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
708 movd %mm2, 704(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
709 movd %mm4, 448(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
710 psrlq $32, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
711 psrlq $32, %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
712 psrlq $32, %mm4
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
713 movd %mm0, 64(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
714 movd %mm2, 320(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
715 movd %mm4, 576(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
716 pfadd 80(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
717 pfadd 72(%edx), %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
718 pfadd 88(%edx), %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
719 movd %mm1, 832(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
720 movd %mm3, 576(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
721 movd %mm5, 320(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
722 psrlq $32, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
723 psrlq $32, %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
724 psrlq $32, %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
725 movd %mm1, 192(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
726 movd %mm3, 448(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
727 movd %mm5, 704(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
728
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
729 movd 120(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
730 pfadd 100(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
731 movq %mm0, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
732 pfadd 88(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
733 movd %mm0, 192(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
734 pfadd 68(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
735 movd %mm1, 64(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
736
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
737 movd 124(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
738 movd %mm0, 960(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
739 pfadd 92(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
740 movd %mm0, 832(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
741
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
742 jmp .L_bye
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
743 .L01:
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
744 /* Phase 9*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
745
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
746 movq (%ecx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
747 movq %mm0, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
748 pxor %mm7, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
749 pfacc %mm1, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
750 pfmul %mm6, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
751 pf2id %mm0, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
752 movd %mm0, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
753 movw %ax, 512(%esi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
754 psrlq $32, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
755 movd %mm0, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
756 movw %ax, (%esi)
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
757
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
758 movd 12(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
759 pfsub 8(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
760 pfmul 120(%ebx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
761 pf2id %mm0, %mm7
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
762 movd %mm7, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
763 movw %ax, 256(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
764 pfadd 12(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
765 pfadd 8(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
766 pf2id %mm0, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
767 movd %mm0, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
768 movw %ax, 256(%esi)
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
769
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
770 movd 16(%ecx), %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
771 pfsub 20(%ecx), %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
772 pfmul 120(%ebx), %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
773 movq %mm3, %mm2
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
774
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
775 movd 28(%ecx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
776 pfsub 24(%ecx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
777 pfmul 120(%ebx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
778 movq %mm2, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
779
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
780 pf2id %mm2, %mm7
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
781 movd %mm7, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
782 movw %ax, 384(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
783
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
784 pfadd 24(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
785 pfadd 28(%ecx), %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
786 movq %mm1, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
787
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
788 pfadd 16(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
789 pfadd 20(%ecx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
790 pf2id %mm0, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
791 movd %mm0, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
792 movw %ax, 384(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
793 pfadd %mm3, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
794 pf2id %mm1, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
795 movd %mm1, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
796 movw %ax, 128(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
797 pfadd %mm3, %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
798 pf2id %mm2, %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
799 movd %mm2, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
800 movw %ax, 128(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
801
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
802 /* Phase 10*/
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
803
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
804 movq 32(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
805 movq 48(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
806 pfadd 48(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
807 pfadd 40(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
808 pf2id %mm0, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
809 pf2id %mm1, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
810 movd %mm0, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
811 movd %mm1, %ecx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
812 movw %ax, 448(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
813 movw %cx, 320(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
814 psrlq $32, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
815 psrlq $32, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
816 movd %mm0, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
817 movd %mm1, %ecx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
818 movw %ax, 64(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
819 movw %cx, 192(%edi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
820
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
821 movd 40(%edx), %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
822 movd 56(%edx), %mm4
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
823 movd 60(%edx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
824 movd 44(%edx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
825 movd 120(%edx), %mm5
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
826 punpckldq %mm4, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
827 punpckldq 124(%edx), %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
828 pfadd 100(%edx), %mm5
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
829 punpckldq 36(%edx), %mm4
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
830 punpckldq 92(%edx), %mm2
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
831 movq %mm5, %mm6
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
832 pfadd %mm4, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
833 pf2id %mm0, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
834 pf2id %mm3, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
835 pfadd 88(%edx), %mm5
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
836 movd %mm1, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
837 movd %mm3, %ecx
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
838 movw %ax, 448(%edi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
839 movw %cx, 192(%esi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
840 pf2id %mm5, %mm5
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
841 psrlq $32, %mm1
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
842 psrlq $32, %mm3
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
843 movd %mm5, %ebx
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
844 movd %mm1, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
845 movd %mm3, %ecx
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
846 movw %bx, 96(%esi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
847 movw %ax, 480(%edi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
848 movw %cx, 64(%esi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
849 pfadd %mm2, %mm0
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
850 pf2id %mm0, %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
851 movd %mm0, %eax
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
852 pfadd 68(%edx), %mm6
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
853 movw %ax, 320(%edi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
854 psrlq $32, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
855 pf2id %mm6, %mm6
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
856 movd %mm0, %eax
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
857 movd %mm6, %ebx
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
858 movw %ax, 416(%edi)
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
859 movw %bx, 32(%esi)
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
860
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
861 movq 96(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
862 movq 112(%edx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
863 movq 104(%edx), %mm4
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
864 pfadd %mm2, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
865 pfadd %mm4, %mm2
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
866 pfadd 120(%edx), %mm4
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
867 movq %mm0, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
868 movq %mm2, %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
869 movq %mm4, %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
870 pfadd 64(%edx), %mm0
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
871 pfadd 80(%edx), %mm2
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
872 pfadd 72(%edx), %mm4
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
873 pf2id %mm0, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
874 pf2id %mm2, %mm2
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
875 pf2id %mm4, %mm4
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
876 movd %mm0, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
877 movd %mm2, %ecx
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
878 movd %mm4, %ebx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
879 movw %ax, 480(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
880 movw %cx, 352(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
881 movw %bx, 224(%esi)
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
882 psrlq $32, %mm0
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
883 psrlq $32, %mm2
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
884 psrlq $32, %mm4
1277
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
885 movd %mm0, %eax
3a9699d9e7da Slight otimization
nick
parents: 1271
diff changeset
886 movd %mm2, %ecx
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
887 movd %mm4, %ebx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
888 movw %ax, 32(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
889 movw %cx, 160(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
890 movw %bx, 288(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
891 pfadd 80(%edx), %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
892 pfadd 72(%edx), %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
893 pfadd 88(%edx), %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
894 pf2id %mm1, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
895 pf2id %mm3, %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
896 pf2id %mm5, %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
897 movd %mm1, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
898 movd %mm3, %ecx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
899 movd %mm5, %ebx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
900 movw %ax, 416(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
901 movw %cx, 288(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
902 movw %bx, 160(%esi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
903 psrlq $32, %mm1
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
904 psrlq $32, %mm3
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
905 psrlq $32, %mm5
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
906 movd %mm1, %eax
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
907 movd %mm3, %ecx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
908 movd %mm5, %ebx
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
909 movw %ax, 96(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
910 movw %cx, 224(%edi)
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
911 movw %bx, 352(%edi)
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
912
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
913 movsw
1
3b5f5d1c5041 Initial revision
arpi_esp
parents:
diff changeset
914
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
915 .L_bye:
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
916 addl $256,%esp
1271
2864e32cd267 Finished 3dnow optimization (in scalar mode) and minor improvements
nick
parents: 1245
diff changeset
917 femms
1245
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
918 popl %edi
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
919 popl %esi
03b7e2955a20 Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu.
nick
parents: 1
diff changeset
920 popl %ebx
1282
175423b2691e Minor optimization
nick
parents: 1277
diff changeset
921 ret $12