Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_rnd.h @ 2979:bfabfdf9ce55 libavcodec
COSMETICS: tabs --> spaces, some prettyprinting
author | diego |
---|---|
date | Thu, 22 Dec 2005 01:10:11 +0000 |
parents | ef2149182f1c |
children | 0b546eab515d |
comparison
equal
deleted
inserted
replaced
2978:403183bbb505 | 2979:bfabfdf9ce55 |
---|---|
25 // put_pixels | 25 // put_pixels |
26 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 26 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
27 { | 27 { |
28 MOVQ_BFE(mm6); | 28 MOVQ_BFE(mm6); |
29 __asm __volatile( | 29 __asm __volatile( |
30 "lea (%3, %3), %%"REG_a" \n\t" | 30 "lea (%3, %3), %%"REG_a" \n\t" |
31 ".balign 8 \n\t" | 31 ".balign 8 \n\t" |
32 "1: \n\t" | 32 "1: \n\t" |
33 "movq (%1), %%mm0 \n\t" | 33 "movq (%1), %%mm0 \n\t" |
34 "movq 1(%1), %%mm1 \n\t" | 34 "movq 1(%1), %%mm1 \n\t" |
35 "movq (%1, %3), %%mm2 \n\t" | 35 "movq (%1, %3), %%mm2 \n\t" |
36 "movq 1(%1, %3), %%mm3 \n\t" | 36 "movq 1(%1, %3), %%mm3 \n\t" |
37 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 37 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
38 "movq %%mm4, (%2) \n\t" | 38 "movq %%mm4, (%2) \n\t" |
39 "movq %%mm5, (%2, %3) \n\t" | 39 "movq %%mm5, (%2, %3) \n\t" |
40 "add %%"REG_a", %1 \n\t" | 40 "add %%"REG_a", %1 \n\t" |
41 "add %%"REG_a", %2 \n\t" | 41 "add %%"REG_a", %2 \n\t" |
42 "movq (%1), %%mm0 \n\t" | 42 "movq (%1), %%mm0 \n\t" |
43 "movq 1(%1), %%mm1 \n\t" | 43 "movq 1(%1), %%mm1 \n\t" |
44 "movq (%1, %3), %%mm2 \n\t" | 44 "movq (%1, %3), %%mm2 \n\t" |
45 "movq 1(%1, %3), %%mm3 \n\t" | 45 "movq 1(%1, %3), %%mm3 \n\t" |
46 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 46 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
47 "movq %%mm4, (%2) \n\t" | 47 "movq %%mm4, (%2) \n\t" |
48 "movq %%mm5, (%2, %3) \n\t" | 48 "movq %%mm5, (%2, %3) \n\t" |
49 "add %%"REG_a", %1 \n\t" | 49 "add %%"REG_a", %1 \n\t" |
50 "add %%"REG_a", %2 \n\t" | 50 "add %%"REG_a", %2 \n\t" |
51 "subl $4, %0 \n\t" | 51 "subl $4, %0 \n\t" |
52 "jnz 1b \n\t" | 52 "jnz 1b \n\t" |
53 :"+g"(h), "+S"(pixels), "+D"(block) | 53 :"+g"(h), "+S"(pixels), "+D"(block) |
54 :"r"((long)line_size) | 54 :"r"((long)line_size) |
55 :REG_a, "memory"); | 55 :REG_a, "memory"); |
56 } | 56 } |
57 | 57 |
58 static void attribute_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 58 static void attribute_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
59 { | 59 { |
60 MOVQ_BFE(mm6); | 60 MOVQ_BFE(mm6); |
61 __asm __volatile( | 61 __asm __volatile( |
62 "testl $1, %0 \n\t" | 62 "testl $1, %0 \n\t" |
63 " jz 1f \n\t" | 63 " jz 1f \n\t" |
64 "movq (%1), %%mm0 \n\t" | 64 "movq (%1), %%mm0 \n\t" |
65 "movq (%2), %%mm1 \n\t" | 65 "movq (%2), %%mm1 \n\t" |
66 "add %4, %1 \n\t" | 66 "add %4, %1 \n\t" |
67 "add $8, %2 \n\t" | 67 "add $8, %2 \n\t" |
68 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | 68 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) |
69 "movq %%mm4, (%3) \n\t" | 69 "movq %%mm4, (%3) \n\t" |
70 "add %5, %3 \n\t" | 70 "add %5, %3 \n\t" |
71 "decl %0 \n\t" | 71 "decl %0 \n\t" |
72 ".balign 8 \n\t" | 72 ".balign 8 \n\t" |
73 "1: \n\t" | 73 "1: \n\t" |
74 "movq (%1), %%mm0 \n\t" | 74 "movq (%1), %%mm0 \n\t" |
75 "movq (%2), %%mm1 \n\t" | 75 "movq (%2), %%mm1 \n\t" |
76 "add %4, %1 \n\t" | 76 "add %4, %1 \n\t" |
77 "movq (%1), %%mm2 \n\t" | 77 "movq (%1), %%mm2 \n\t" |
78 "movq 8(%2), %%mm3 \n\t" | 78 "movq 8(%2), %%mm3 \n\t" |
79 "add %4, %1 \n\t" | 79 "add %4, %1 \n\t" |
80 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 80 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
81 "movq %%mm4, (%3) \n\t" | 81 "movq %%mm4, (%3) \n\t" |
82 "add %5, %3 \n\t" | 82 "add %5, %3 \n\t" |
83 "movq %%mm5, (%3) \n\t" | 83 "movq %%mm5, (%3) \n\t" |
84 "add %5, %3 \n\t" | 84 "add %5, %3 \n\t" |
85 "movq (%1), %%mm0 \n\t" | 85 "movq (%1), %%mm0 \n\t" |
86 "movq 16(%2), %%mm1 \n\t" | 86 "movq 16(%2), %%mm1 \n\t" |
87 "add %4, %1 \n\t" | 87 "add %4, %1 \n\t" |
88 "movq (%1), %%mm2 \n\t" | 88 "movq (%1), %%mm2 \n\t" |
89 "movq 24(%2), %%mm3 \n\t" | 89 "movq 24(%2), %%mm3 \n\t" |
90 "add %4, %1 \n\t" | 90 "add %4, %1 \n\t" |
91 "add $32, %2 \n\t" | 91 "add $32, %2 \n\t" |
92 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 92 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
93 "movq %%mm4, (%3) \n\t" | 93 "movq %%mm4, (%3) \n\t" |
94 "add %5, %3 \n\t" | 94 "add %5, %3 \n\t" |
95 "movq %%mm5, (%3) \n\t" | 95 "movq %%mm5, (%3) \n\t" |
96 "add %5, %3 \n\t" | 96 "add %5, %3 \n\t" |
97 "subl $4, %0 \n\t" | 97 "subl $4, %0 \n\t" |
98 "jnz 1b \n\t" | 98 "jnz 1b \n\t" |
99 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used | 99 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used |
100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | 100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
101 #else | 101 #else |
102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | 102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
103 #endif | 103 #endif |
104 :"S"((long)src1Stride), "D"((long)dstStride) | 104 :"S"((long)src1Stride), "D"((long)dstStride) |
105 :"memory"); | 105 :"memory"); |
106 } | 106 } |
107 | 107 |
108 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 108 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
109 { | 109 { |
110 MOVQ_BFE(mm6); | 110 MOVQ_BFE(mm6); |
111 __asm __volatile( | 111 __asm __volatile( |
112 "lea (%3, %3), %%"REG_a" \n\t" | 112 "lea (%3, %3), %%"REG_a" \n\t" |
113 ".balign 8 \n\t" | 113 ".balign 8 \n\t" |
114 "1: \n\t" | 114 "1: \n\t" |
115 "movq (%1), %%mm0 \n\t" | 115 "movq (%1), %%mm0 \n\t" |
116 "movq 1(%1), %%mm1 \n\t" | 116 "movq 1(%1), %%mm1 \n\t" |
117 "movq (%1, %3), %%mm2 \n\t" | 117 "movq (%1, %3), %%mm2 \n\t" |
118 "movq 1(%1, %3), %%mm3 \n\t" | 118 "movq 1(%1, %3), %%mm3 \n\t" |
119 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 119 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
120 "movq %%mm4, (%2) \n\t" | 120 "movq %%mm4, (%2) \n\t" |
121 "movq %%mm5, (%2, %3) \n\t" | 121 "movq %%mm5, (%2, %3) \n\t" |
122 "movq 8(%1), %%mm0 \n\t" | 122 "movq 8(%1), %%mm0 \n\t" |
123 "movq 9(%1), %%mm1 \n\t" | 123 "movq 9(%1), %%mm1 \n\t" |
124 "movq 8(%1, %3), %%mm2 \n\t" | 124 "movq 8(%1, %3), %%mm2 \n\t" |
125 "movq 9(%1, %3), %%mm3 \n\t" | 125 "movq 9(%1, %3), %%mm3 \n\t" |
126 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 126 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
127 "movq %%mm4, 8(%2) \n\t" | 127 "movq %%mm4, 8(%2) \n\t" |
128 "movq %%mm5, 8(%2, %3) \n\t" | 128 "movq %%mm5, 8(%2, %3) \n\t" |
129 "add %%"REG_a", %1 \n\t" | 129 "add %%"REG_a", %1 \n\t" |
130 "add %%"REG_a", %2 \n\t" | 130 "add %%"REG_a", %2 \n\t" |
131 "movq (%1), %%mm0 \n\t" | 131 "movq (%1), %%mm0 \n\t" |
132 "movq 1(%1), %%mm1 \n\t" | 132 "movq 1(%1), %%mm1 \n\t" |
133 "movq (%1, %3), %%mm2 \n\t" | 133 "movq (%1, %3), %%mm2 \n\t" |
134 "movq 1(%1, %3), %%mm3 \n\t" | 134 "movq 1(%1, %3), %%mm3 \n\t" |
135 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 135 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
136 "movq %%mm4, (%2) \n\t" | 136 "movq %%mm4, (%2) \n\t" |
137 "movq %%mm5, (%2, %3) \n\t" | 137 "movq %%mm5, (%2, %3) \n\t" |
138 "movq 8(%1), %%mm0 \n\t" | 138 "movq 8(%1), %%mm0 \n\t" |
139 "movq 9(%1), %%mm1 \n\t" | 139 "movq 9(%1), %%mm1 \n\t" |
140 "movq 8(%1, %3), %%mm2 \n\t" | 140 "movq 8(%1, %3), %%mm2 \n\t" |
141 "movq 9(%1, %3), %%mm3 \n\t" | 141 "movq 9(%1, %3), %%mm3 \n\t" |
142 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 142 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
143 "movq %%mm4, 8(%2) \n\t" | 143 "movq %%mm4, 8(%2) \n\t" |
144 "movq %%mm5, 8(%2, %3) \n\t" | 144 "movq %%mm5, 8(%2, %3) \n\t" |
145 "add %%"REG_a", %1 \n\t" | 145 "add %%"REG_a", %1 \n\t" |
146 "add %%"REG_a", %2 \n\t" | 146 "add %%"REG_a", %2 \n\t" |
147 "subl $4, %0 \n\t" | 147 "subl $4, %0 \n\t" |
148 "jnz 1b \n\t" | 148 "jnz 1b \n\t" |
149 :"+g"(h), "+S"(pixels), "+D"(block) | 149 :"+g"(h), "+S"(pixels), "+D"(block) |
150 :"r"((long)line_size) | 150 :"r"((long)line_size) |
151 :REG_a, "memory"); | 151 :REG_a, "memory"); |
152 } | 152 } |
153 | 153 |
154 static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 154 static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
155 { | 155 { |
156 MOVQ_BFE(mm6); | 156 MOVQ_BFE(mm6); |
157 __asm __volatile( | 157 __asm __volatile( |
158 "testl $1, %0 \n\t" | 158 "testl $1, %0 \n\t" |
159 " jz 1f \n\t" | 159 " jz 1f \n\t" |
160 "movq (%1), %%mm0 \n\t" | 160 "movq (%1), %%mm0 \n\t" |
161 "movq (%2), %%mm1 \n\t" | 161 "movq (%2), %%mm1 \n\t" |
162 "movq 8(%1), %%mm2 \n\t" | 162 "movq 8(%1), %%mm2 \n\t" |
163 "movq 8(%2), %%mm3 \n\t" | 163 "movq 8(%2), %%mm3 \n\t" |
164 "add %4, %1 \n\t" | 164 "add %4, %1 \n\t" |
165 "add $16, %2 \n\t" | 165 "add $16, %2 \n\t" |
166 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 166 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
167 "movq %%mm4, (%3) \n\t" | 167 "movq %%mm4, (%3) \n\t" |
168 "movq %%mm5, 8(%3) \n\t" | 168 "movq %%mm5, 8(%3) \n\t" |
169 "add %5, %3 \n\t" | 169 "add %5, %3 \n\t" |
170 "decl %0 \n\t" | 170 "decl %0 \n\t" |
171 ".balign 8 \n\t" | 171 ".balign 8 \n\t" |
172 "1: \n\t" | 172 "1: \n\t" |
173 "movq (%1), %%mm0 \n\t" | 173 "movq (%1), %%mm0 \n\t" |
174 "movq (%2), %%mm1 \n\t" | 174 "movq (%2), %%mm1 \n\t" |
175 "movq 8(%1), %%mm2 \n\t" | 175 "movq 8(%1), %%mm2 \n\t" |
176 "movq 8(%2), %%mm3 \n\t" | 176 "movq 8(%2), %%mm3 \n\t" |
177 "add %4, %1 \n\t" | 177 "add %4, %1 \n\t" |
178 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 178 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
179 "movq %%mm4, (%3) \n\t" | 179 "movq %%mm4, (%3) \n\t" |
180 "movq %%mm5, 8(%3) \n\t" | 180 "movq %%mm5, 8(%3) \n\t" |
181 "add %5, %3 \n\t" | 181 "add %5, %3 \n\t" |
182 "movq (%1), %%mm0 \n\t" | 182 "movq (%1), %%mm0 \n\t" |
183 "movq 16(%2), %%mm1 \n\t" | 183 "movq 16(%2), %%mm1 \n\t" |
184 "movq 8(%1), %%mm2 \n\t" | 184 "movq 8(%1), %%mm2 \n\t" |
185 "movq 24(%2), %%mm3 \n\t" | 185 "movq 24(%2), %%mm3 \n\t" |
186 "add %4, %1 \n\t" | 186 "add %4, %1 \n\t" |
187 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | 187 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
188 "movq %%mm4, (%3) \n\t" | 188 "movq %%mm4, (%3) \n\t" |
189 "movq %%mm5, 8(%3) \n\t" | 189 "movq %%mm5, 8(%3) \n\t" |
190 "add %5, %3 \n\t" | 190 "add %5, %3 \n\t" |
191 "add $32, %2 \n\t" | 191 "add $32, %2 \n\t" |
192 "subl $2, %0 \n\t" | 192 "subl $2, %0 \n\t" |
193 "jnz 1b \n\t" | 193 "jnz 1b \n\t" |
194 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used | 194 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and can't be used |
195 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) | 195 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
196 #else | 196 #else |
197 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | 197 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
198 #endif | 198 #endif |
199 :"S"((long)src1Stride), "D"((long)dstStride) | 199 :"S"((long)src1Stride), "D"((long)dstStride) |
200 :"memory"); | 200 :"memory"); |
201 } | 201 } |
202 | 202 |
203 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 203 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
204 { | 204 { |
205 MOVQ_BFE(mm6); | 205 MOVQ_BFE(mm6); |
206 __asm __volatile( | 206 __asm __volatile( |
207 "lea (%3, %3), %%"REG_a" \n\t" | 207 "lea (%3, %3), %%"REG_a" \n\t" |
208 "movq (%1), %%mm0 \n\t" | 208 "movq (%1), %%mm0 \n\t" |
209 ".balign 8 \n\t" | 209 ".balign 8 \n\t" |
210 "1: \n\t" | 210 "1: \n\t" |
211 "movq (%1, %3), %%mm1 \n\t" | 211 "movq (%1, %3), %%mm1 \n\t" |
212 "movq (%1, %%"REG_a"),%%mm2 \n\t" | 212 "movq (%1, %%"REG_a"),%%mm2 \n\t" |
213 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | 213 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) |
214 "movq %%mm4, (%2) \n\t" | 214 "movq %%mm4, (%2) \n\t" |
215 "movq %%mm5, (%2, %3) \n\t" | 215 "movq %%mm5, (%2, %3) \n\t" |
216 "add %%"REG_a", %1 \n\t" | 216 "add %%"REG_a", %1 \n\t" |
217 "add %%"REG_a", %2 \n\t" | 217 "add %%"REG_a", %2 \n\t" |
218 "movq (%1, %3), %%mm1 \n\t" | 218 "movq (%1, %3), %%mm1 \n\t" |
219 "movq (%1, %%"REG_a"),%%mm0 \n\t" | 219 "movq (%1, %%"REG_a"),%%mm0 \n\t" |
220 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | 220 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) |
221 "movq %%mm4, (%2) \n\t" | 221 "movq %%mm4, (%2) \n\t" |
222 "movq %%mm5, (%2, %3) \n\t" | 222 "movq %%mm5, (%2, %3) \n\t" |
223 "add %%"REG_a", %1 \n\t" | 223 "add %%"REG_a", %1 \n\t" |
224 "add %%"REG_a", %2 \n\t" | 224 "add %%"REG_a", %2 \n\t" |
225 "subl $4, %0 \n\t" | 225 "subl $4, %0 \n\t" |
226 "jnz 1b \n\t" | 226 "jnz 1b \n\t" |
227 :"+g"(h), "+S"(pixels), "+D"(block) | 227 :"+g"(h), "+S"(pixels), "+D"(block) |
228 :"r"((long)line_size) | 228 :"r"((long)line_size) |
229 :REG_a, "memory"); | 229 :REG_a, "memory"); |
230 } | 230 } |
231 | 231 |
232 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 232 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
233 { | 233 { |
234 MOVQ_ZERO(mm7); | 234 MOVQ_ZERO(mm7); |
235 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | 235 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
236 __asm __volatile( | 236 __asm __volatile( |
237 "movq (%1), %%mm0 \n\t" | 237 "movq (%1), %%mm0 \n\t" |
238 "movq 1(%1), %%mm4 \n\t" | 238 "movq 1(%1), %%mm4 \n\t" |
239 "movq %%mm0, %%mm1 \n\t" | 239 "movq %%mm0, %%mm1 \n\t" |
240 "movq %%mm4, %%mm5 \n\t" | 240 "movq %%mm4, %%mm5 \n\t" |
241 "punpcklbw %%mm7, %%mm0 \n\t" | 241 "punpcklbw %%mm7, %%mm0 \n\t" |
242 "punpcklbw %%mm7, %%mm4 \n\t" | 242 "punpcklbw %%mm7, %%mm4 \n\t" |
243 "punpckhbw %%mm7, %%mm1 \n\t" | 243 "punpckhbw %%mm7, %%mm1 \n\t" |
244 "punpckhbw %%mm7, %%mm5 \n\t" | 244 "punpckhbw %%mm7, %%mm5 \n\t" |
245 "paddusw %%mm0, %%mm4 \n\t" | 245 "paddusw %%mm0, %%mm4 \n\t" |
246 "paddusw %%mm1, %%mm5 \n\t" | 246 "paddusw %%mm1, %%mm5 \n\t" |
247 "xor %%"REG_a", %%"REG_a" \n\t" | 247 "xor %%"REG_a", %%"REG_a" \n\t" |
248 "add %3, %1 \n\t" | 248 "add %3, %1 \n\t" |
249 ".balign 8 \n\t" | 249 ".balign 8 \n\t" |
250 "1: \n\t" | 250 "1: \n\t" |
251 "movq (%1, %%"REG_a"), %%mm0 \n\t" | 251 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
252 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | 252 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
253 "movq %%mm0, %%mm1 \n\t" | 253 "movq %%mm0, %%mm1 \n\t" |
254 "movq %%mm2, %%mm3 \n\t" | 254 "movq %%mm2, %%mm3 \n\t" |
255 "punpcklbw %%mm7, %%mm0 \n\t" | 255 "punpcklbw %%mm7, %%mm0 \n\t" |
256 "punpcklbw %%mm7, %%mm2 \n\t" | 256 "punpcklbw %%mm7, %%mm2 \n\t" |
257 "punpckhbw %%mm7, %%mm1 \n\t" | 257 "punpckhbw %%mm7, %%mm1 \n\t" |
258 "punpckhbw %%mm7, %%mm3 \n\t" | 258 "punpckhbw %%mm7, %%mm3 \n\t" |
259 "paddusw %%mm2, %%mm0 \n\t" | 259 "paddusw %%mm2, %%mm0 \n\t" |
260 "paddusw %%mm3, %%mm1 \n\t" | 260 "paddusw %%mm3, %%mm1 \n\t" |
261 "paddusw %%mm6, %%mm4 \n\t" | 261 "paddusw %%mm6, %%mm4 \n\t" |
262 "paddusw %%mm6, %%mm5 \n\t" | 262 "paddusw %%mm6, %%mm5 \n\t" |
263 "paddusw %%mm0, %%mm4 \n\t" | 263 "paddusw %%mm0, %%mm4 \n\t" |
264 "paddusw %%mm1, %%mm5 \n\t" | 264 "paddusw %%mm1, %%mm5 \n\t" |
265 "psrlw $2, %%mm4 \n\t" | 265 "psrlw $2, %%mm4 \n\t" |
266 "psrlw $2, %%mm5 \n\t" | 266 "psrlw $2, %%mm5 \n\t" |
267 "packuswb %%mm5, %%mm4 \n\t" | 267 "packuswb %%mm5, %%mm4 \n\t" |
268 "movq %%mm4, (%2, %%"REG_a") \n\t" | 268 "movq %%mm4, (%2, %%"REG_a") \n\t" |
269 "add %3, %%"REG_a" \n\t" | 269 "add %3, %%"REG_a" \n\t" |
270 | 270 |
271 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | 271 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
272 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | 272 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
273 "movq %%mm2, %%mm3 \n\t" | 273 "movq %%mm2, %%mm3 \n\t" |
274 "movq %%mm4, %%mm5 \n\t" | 274 "movq %%mm4, %%mm5 \n\t" |
275 "punpcklbw %%mm7, %%mm2 \n\t" | 275 "punpcklbw %%mm7, %%mm2 \n\t" |
276 "punpcklbw %%mm7, %%mm4 \n\t" | 276 "punpcklbw %%mm7, %%mm4 \n\t" |
277 "punpckhbw %%mm7, %%mm3 \n\t" | 277 "punpckhbw %%mm7, %%mm3 \n\t" |
278 "punpckhbw %%mm7, %%mm5 \n\t" | 278 "punpckhbw %%mm7, %%mm5 \n\t" |
279 "paddusw %%mm2, %%mm4 \n\t" | 279 "paddusw %%mm2, %%mm4 \n\t" |
280 "paddusw %%mm3, %%mm5 \n\t" | 280 "paddusw %%mm3, %%mm5 \n\t" |
281 "paddusw %%mm6, %%mm0 \n\t" | 281 "paddusw %%mm6, %%mm0 \n\t" |
282 "paddusw %%mm6, %%mm1 \n\t" | 282 "paddusw %%mm6, %%mm1 \n\t" |
283 "paddusw %%mm4, %%mm0 \n\t" | 283 "paddusw %%mm4, %%mm0 \n\t" |
284 "paddusw %%mm5, %%mm1 \n\t" | 284 "paddusw %%mm5, %%mm1 \n\t" |
285 "psrlw $2, %%mm0 \n\t" | 285 "psrlw $2, %%mm0 \n\t" |
286 "psrlw $2, %%mm1 \n\t" | 286 "psrlw $2, %%mm1 \n\t" |
287 "packuswb %%mm1, %%mm0 \n\t" | 287 "packuswb %%mm1, %%mm0 \n\t" |
288 "movq %%mm0, (%2, %%"REG_a") \n\t" | 288 "movq %%mm0, (%2, %%"REG_a") \n\t" |
289 "add %3, %%"REG_a" \n\t" | 289 "add %3, %%"REG_a" \n\t" |
290 | 290 |
291 "subl $2, %0 \n\t" | 291 "subl $2, %0 \n\t" |
292 "jnz 1b \n\t" | 292 "jnz 1b \n\t" |
293 :"+g"(h), "+S"(pixels) | 293 :"+g"(h), "+S"(pixels) |
294 :"D"(block), "r"((long)line_size) | 294 :"D"(block), "r"((long)line_size) |
295 :REG_a, "memory"); | 295 :REG_a, "memory"); |
296 } | 296 } |
297 | 297 |
298 // avg_pixels | 298 // avg_pixels |
299 static void attribute_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 299 static void attribute_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
300 { | 300 { |
301 MOVQ_BFE(mm6); | 301 MOVQ_BFE(mm6); |
302 JUMPALIGN(); | 302 JUMPALIGN(); |
303 do { | 303 do { |
304 __asm __volatile( | 304 __asm __volatile( |
305 "movd %0, %%mm0 \n\t" | 305 "movd %0, %%mm0 \n\t" |
306 "movd %1, %%mm1 \n\t" | 306 "movd %1, %%mm1 \n\t" |
307 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 307 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
308 "movd %%mm2, %0 \n\t" | 308 "movd %%mm2, %0 \n\t" |
309 :"+m"(*block) | 309 :"+m"(*block) |
310 :"m"(*pixels) | 310 :"m"(*pixels) |
311 :"memory"); | 311 :"memory"); |
312 pixels += line_size; | 312 pixels += line_size; |
313 block += line_size; | 313 block += line_size; |
314 } | 314 } |
315 while (--h); | 315 while (--h); |
316 } | 316 } |
317 | 317 |
318 // in case more speed is needed - unrolling would certainly help | 318 // in case more speed is needed - unrolling would certainly help |
319 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 319 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
320 { | 320 { |
321 MOVQ_BFE(mm6); | 321 MOVQ_BFE(mm6); |
322 JUMPALIGN(); | 322 JUMPALIGN(); |
323 do { | 323 do { |
324 __asm __volatile( | 324 __asm __volatile( |
325 "movq %0, %%mm0 \n\t" | 325 "movq %0, %%mm0 \n\t" |
326 "movq %1, %%mm1 \n\t" | 326 "movq %1, %%mm1 \n\t" |
327 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 327 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
328 "movq %%mm2, %0 \n\t" | 328 "movq %%mm2, %0 \n\t" |
329 :"+m"(*block) | 329 :"+m"(*block) |
330 :"m"(*pixels) | 330 :"m"(*pixels) |
331 :"memory"); | 331 :"memory"); |
332 pixels += line_size; | 332 pixels += line_size; |
333 block += line_size; | 333 block += line_size; |
334 } | 334 } |
335 while (--h); | 335 while (--h); |
336 } | 336 } |
337 | 337 |
338 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 338 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
339 { | 339 { |
340 MOVQ_BFE(mm6); | 340 MOVQ_BFE(mm6); |
341 JUMPALIGN(); | 341 JUMPALIGN(); |
342 do { | 342 do { |
343 __asm __volatile( | 343 __asm __volatile( |
344 "movq %0, %%mm0 \n\t" | 344 "movq %0, %%mm0 \n\t" |
345 "movq %1, %%mm1 \n\t" | 345 "movq %1, %%mm1 \n\t" |
346 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 346 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
347 "movq %%mm2, %0 \n\t" | 347 "movq %%mm2, %0 \n\t" |
348 "movq 8%0, %%mm0 \n\t" | 348 "movq 8%0, %%mm0 \n\t" |
349 "movq 8%1, %%mm1 \n\t" | 349 "movq 8%1, %%mm1 \n\t" |
350 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 350 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
351 "movq %%mm2, 8%0 \n\t" | 351 "movq %%mm2, 8%0 \n\t" |
352 :"+m"(*block) | 352 :"+m"(*block) |
353 :"m"(*pixels) | 353 :"m"(*pixels) |
354 :"memory"); | 354 :"memory"); |
355 pixels += line_size; | 355 pixels += line_size; |
356 block += line_size; | 356 block += line_size; |
357 } | 357 } |
358 while (--h); | 358 while (--h); |
359 } | 359 } |
360 | 360 |
361 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 361 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
362 { | 362 { |
363 MOVQ_BFE(mm6); | 363 MOVQ_BFE(mm6); |
364 JUMPALIGN(); | 364 JUMPALIGN(); |
365 do { | 365 do { |
366 __asm __volatile( | 366 __asm __volatile( |
367 "movq %1, %%mm0 \n\t" | 367 "movq %1, %%mm0 \n\t" |
368 "movq 1%1, %%mm1 \n\t" | 368 "movq 1%1, %%mm1 \n\t" |
369 "movq %0, %%mm3 \n\t" | 369 "movq %0, %%mm3 \n\t" |
370 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 370 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
371 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 371 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
372 "movq %%mm0, %0 \n\t" | 372 "movq %%mm0, %0 \n\t" |
373 :"+m"(*block) | 373 :"+m"(*block) |
374 :"m"(*pixels) | 374 :"m"(*pixels) |
375 :"memory"); | 375 :"memory"); |
376 pixels += line_size; | 376 pixels += line_size; |
377 block += line_size; | 377 block += line_size; |
378 } while (--h); | 378 } while (--h); |
379 } | 379 } |
380 | 380 |
381 static __attribute__((unused)) void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 381 static __attribute__((unused)) void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
382 { | 382 { |
383 MOVQ_BFE(mm6); | 383 MOVQ_BFE(mm6); |
384 JUMPALIGN(); | 384 JUMPALIGN(); |
385 do { | 385 do { |
386 __asm __volatile( | 386 __asm __volatile( |
387 "movq %1, %%mm0 \n\t" | 387 "movq %1, %%mm0 \n\t" |
388 "movq %2, %%mm1 \n\t" | 388 "movq %2, %%mm1 \n\t" |
389 "movq %0, %%mm3 \n\t" | 389 "movq %0, %%mm3 \n\t" |
390 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 390 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
391 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 391 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
392 "movq %%mm0, %0 \n\t" | 392 "movq %%mm0, %0 \n\t" |
393 :"+m"(*dst) | 393 :"+m"(*dst) |
394 :"m"(*src1), "m"(*src2) | 394 :"m"(*src1), "m"(*src2) |
395 :"memory"); | 395 :"memory"); |
396 dst += dstStride; | 396 dst += dstStride; |
397 src1 += src1Stride; | 397 src1 += src1Stride; |
398 src2 += 8; | 398 src2 += 8; |
399 } while (--h); | 399 } while (--h); |
400 } | 400 } |
401 | 401 |
402 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 402 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
403 { | 403 { |
404 MOVQ_BFE(mm6); | 404 MOVQ_BFE(mm6); |
405 JUMPALIGN(); | 405 JUMPALIGN(); |
406 do { | 406 do { |
407 __asm __volatile( | 407 __asm __volatile( |
408 "movq %1, %%mm0 \n\t" | 408 "movq %1, %%mm0 \n\t" |
409 "movq 1%1, %%mm1 \n\t" | 409 "movq 1%1, %%mm1 \n\t" |
410 "movq %0, %%mm3 \n\t" | 410 "movq %0, %%mm3 \n\t" |
411 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 411 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
412 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 412 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
413 "movq %%mm0, %0 \n\t" | 413 "movq %%mm0, %0 \n\t" |
414 "movq 8%1, %%mm0 \n\t" | 414 "movq 8%1, %%mm0 \n\t" |
415 "movq 9%1, %%mm1 \n\t" | 415 "movq 9%1, %%mm1 \n\t" |
416 "movq 8%0, %%mm3 \n\t" | 416 "movq 8%0, %%mm3 \n\t" |
417 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 417 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
418 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 418 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
419 "movq %%mm0, 8%0 \n\t" | 419 "movq %%mm0, 8%0 \n\t" |
420 :"+m"(*block) | 420 :"+m"(*block) |
421 :"m"(*pixels) | 421 :"m"(*pixels) |
422 :"memory"); | 422 :"memory"); |
423 pixels += line_size; | 423 pixels += line_size; |
424 block += line_size; | 424 block += line_size; |
425 } while (--h); | 425 } while (--h); |
426 } | 426 } |
427 | 427 |
428 static __attribute__((unused)) void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 428 static __attribute__((unused)) void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
429 { | 429 { |
430 MOVQ_BFE(mm6); | 430 MOVQ_BFE(mm6); |
431 JUMPALIGN(); | 431 JUMPALIGN(); |
432 do { | 432 do { |
433 __asm __volatile( | 433 __asm __volatile( |
434 "movq %1, %%mm0 \n\t" | 434 "movq %1, %%mm0 \n\t" |
435 "movq %2, %%mm1 \n\t" | 435 "movq %2, %%mm1 \n\t" |
436 "movq %0, %%mm3 \n\t" | 436 "movq %0, %%mm3 \n\t" |
437 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 437 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
438 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 438 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
439 "movq %%mm0, %0 \n\t" | 439 "movq %%mm0, %0 \n\t" |
440 "movq 8%1, %%mm0 \n\t" | 440 "movq 8%1, %%mm0 \n\t" |
441 "movq 8%2, %%mm1 \n\t" | 441 "movq 8%2, %%mm1 \n\t" |
442 "movq 8%0, %%mm3 \n\t" | 442 "movq 8%0, %%mm3 \n\t" |
443 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | 443 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
444 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | 444 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) |
445 "movq %%mm0, 8%0 \n\t" | 445 "movq %%mm0, 8%0 \n\t" |
446 :"+m"(*dst) | 446 :"+m"(*dst) |
447 :"m"(*src1), "m"(*src2) | 447 :"m"(*src1), "m"(*src2) |
448 :"memory"); | 448 :"memory"); |
449 dst += dstStride; | 449 dst += dstStride; |
450 src1 += src1Stride; | 450 src1 += src1Stride; |
451 src2 += 16; | 451 src2 += 16; |
452 } while (--h); | 452 } while (--h); |
453 } | 453 } |
454 | 454 |
455 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 455 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
456 { | 456 { |
457 MOVQ_BFE(mm6); | 457 MOVQ_BFE(mm6); |
458 __asm __volatile( | 458 __asm __volatile( |
459 "lea (%3, %3), %%"REG_a" \n\t" | 459 "lea (%3, %3), %%"REG_a" \n\t" |
460 "movq (%1), %%mm0 \n\t" | 460 "movq (%1), %%mm0 \n\t" |
461 ".balign 8 \n\t" | 461 ".balign 8 \n\t" |
462 "1: \n\t" | 462 "1: \n\t" |
463 "movq (%1, %3), %%mm1 \n\t" | 463 "movq (%1, %3), %%mm1 \n\t" |
464 "movq (%1, %%"REG_a"), %%mm2 \n\t" | 464 "movq (%1, %%"REG_a"), %%mm2 \n\t" |
465 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | 465 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) |
466 "movq (%2), %%mm3 \n\t" | 466 "movq (%2), %%mm3 \n\t" |
467 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) | 467 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) |
468 "movq (%2, %3), %%mm3 \n\t" | 468 "movq (%2, %3), %%mm3 \n\t" |
469 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | 469 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) |
470 "movq %%mm0, (%2) \n\t" | 470 "movq %%mm0, (%2) \n\t" |
471 "movq %%mm1, (%2, %3) \n\t" | 471 "movq %%mm1, (%2, %3) \n\t" |
472 "add %%"REG_a", %1 \n\t" | 472 "add %%"REG_a", %1 \n\t" |
473 "add %%"REG_a", %2 \n\t" | 473 "add %%"REG_a", %2 \n\t" |
474 | 474 |
475 "movq (%1, %3), %%mm1 \n\t" | 475 "movq (%1, %3), %%mm1 \n\t" |
476 "movq (%1, %%"REG_a"), %%mm0 \n\t" | 476 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
477 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | 477 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) |
478 "movq (%2), %%mm3 \n\t" | 478 "movq (%2), %%mm3 \n\t" |
479 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) | 479 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) |
480 "movq (%2, %3), %%mm3 \n\t" | 480 "movq (%2, %3), %%mm3 \n\t" |
481 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | 481 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) |
482 "movq %%mm2, (%2) \n\t" | 482 "movq %%mm2, (%2) \n\t" |
483 "movq %%mm1, (%2, %3) \n\t" | 483 "movq %%mm1, (%2, %3) \n\t" |
484 "add %%"REG_a", %1 \n\t" | 484 "add %%"REG_a", %1 \n\t" |
485 "add %%"REG_a", %2 \n\t" | 485 "add %%"REG_a", %2 \n\t" |
486 | 486 |
487 "subl $4, %0 \n\t" | 487 "subl $4, %0 \n\t" |
488 "jnz 1b \n\t" | 488 "jnz 1b \n\t" |
489 :"+g"(h), "+S"(pixels), "+D"(block) | 489 :"+g"(h), "+S"(pixels), "+D"(block) |
490 :"r"((long)line_size) | 490 :"r"((long)line_size) |
491 :REG_a, "memory"); | 491 :REG_a, "memory"); |
492 } | 492 } |
493 | 493 |
494 // this routine is 'slightly' suboptimal but mostly unused | 494 // this routine is 'slightly' suboptimal but mostly unused |
495 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) | 495 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
496 { | 496 { |
497 MOVQ_ZERO(mm7); | 497 MOVQ_ZERO(mm7); |
498 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | 498 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
499 __asm __volatile( | 499 __asm __volatile( |
500 "movq (%1), %%mm0 \n\t" | 500 "movq (%1), %%mm0 \n\t" |
501 "movq 1(%1), %%mm4 \n\t" | 501 "movq 1(%1), %%mm4 \n\t" |
502 "movq %%mm0, %%mm1 \n\t" | 502 "movq %%mm0, %%mm1 \n\t" |
503 "movq %%mm4, %%mm5 \n\t" | 503 "movq %%mm4, %%mm5 \n\t" |
504 "punpcklbw %%mm7, %%mm0 \n\t" | 504 "punpcklbw %%mm7, %%mm0 \n\t" |
505 "punpcklbw %%mm7, %%mm4 \n\t" | 505 "punpcklbw %%mm7, %%mm4 \n\t" |
506 "punpckhbw %%mm7, %%mm1 \n\t" | 506 "punpckhbw %%mm7, %%mm1 \n\t" |
507 "punpckhbw %%mm7, %%mm5 \n\t" | 507 "punpckhbw %%mm7, %%mm5 \n\t" |
508 "paddusw %%mm0, %%mm4 \n\t" | 508 "paddusw %%mm0, %%mm4 \n\t" |
509 "paddusw %%mm1, %%mm5 \n\t" | 509 "paddusw %%mm1, %%mm5 \n\t" |
510 "xor %%"REG_a", %%"REG_a" \n\t" | 510 "xor %%"REG_a", %%"REG_a" \n\t" |
511 "add %3, %1 \n\t" | 511 "add %3, %1 \n\t" |
512 ".balign 8 \n\t" | 512 ".balign 8 \n\t" |
513 "1: \n\t" | 513 "1: \n\t" |
514 "movq (%1, %%"REG_a"), %%mm0 \n\t" | 514 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
515 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | 515 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
516 "movq %%mm0, %%mm1 \n\t" | 516 "movq %%mm0, %%mm1 \n\t" |
517 "movq %%mm2, %%mm3 \n\t" | 517 "movq %%mm2, %%mm3 \n\t" |
518 "punpcklbw %%mm7, %%mm0 \n\t" | 518 "punpcklbw %%mm7, %%mm0 \n\t" |
519 "punpcklbw %%mm7, %%mm2 \n\t" | 519 "punpcklbw %%mm7, %%mm2 \n\t" |
520 "punpckhbw %%mm7, %%mm1 \n\t" | 520 "punpckhbw %%mm7, %%mm1 \n\t" |
521 "punpckhbw %%mm7, %%mm3 \n\t" | 521 "punpckhbw %%mm7, %%mm3 \n\t" |
522 "paddusw %%mm2, %%mm0 \n\t" | 522 "paddusw %%mm2, %%mm0 \n\t" |
523 "paddusw %%mm3, %%mm1 \n\t" | 523 "paddusw %%mm3, %%mm1 \n\t" |
524 "paddusw %%mm6, %%mm4 \n\t" | 524 "paddusw %%mm6, %%mm4 \n\t" |
525 "paddusw %%mm6, %%mm5 \n\t" | 525 "paddusw %%mm6, %%mm5 \n\t" |
526 "paddusw %%mm0, %%mm4 \n\t" | 526 "paddusw %%mm0, %%mm4 \n\t" |
527 "paddusw %%mm1, %%mm5 \n\t" | 527 "paddusw %%mm1, %%mm5 \n\t" |
528 "psrlw $2, %%mm4 \n\t" | 528 "psrlw $2, %%mm4 \n\t" |
529 "psrlw $2, %%mm5 \n\t" | 529 "psrlw $2, %%mm5 \n\t" |
530 "movq (%2, %%"REG_a"), %%mm3 \n\t" | 530 "movq (%2, %%"REG_a"), %%mm3 \n\t" |
531 "packuswb %%mm5, %%mm4 \n\t" | 531 "packuswb %%mm5, %%mm4 \n\t" |
532 "pcmpeqd %%mm2, %%mm2 \n\t" | 532 "pcmpeqd %%mm2, %%mm2 \n\t" |
533 "paddb %%mm2, %%mm2 \n\t" | 533 "paddb %%mm2, %%mm2 \n\t" |
534 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) | 534 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) |
535 "movq %%mm5, (%2, %%"REG_a") \n\t" | 535 "movq %%mm5, (%2, %%"REG_a") \n\t" |
536 "add %3, %%"REG_a" \n\t" | 536 "add %3, %%"REG_a" \n\t" |
537 | 537 |
538 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | 538 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
539 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | 539 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
540 "movq %%mm2, %%mm3 \n\t" | 540 "movq %%mm2, %%mm3 \n\t" |
541 "movq %%mm4, %%mm5 \n\t" | 541 "movq %%mm4, %%mm5 \n\t" |
542 "punpcklbw %%mm7, %%mm2 \n\t" | 542 "punpcklbw %%mm7, %%mm2 \n\t" |
543 "punpcklbw %%mm7, %%mm4 \n\t" | 543 "punpcklbw %%mm7, %%mm4 \n\t" |
544 "punpckhbw %%mm7, %%mm3 \n\t" | 544 "punpckhbw %%mm7, %%mm3 \n\t" |
545 "punpckhbw %%mm7, %%mm5 \n\t" | 545 "punpckhbw %%mm7, %%mm5 \n\t" |
546 "paddusw %%mm2, %%mm4 \n\t" | 546 "paddusw %%mm2, %%mm4 \n\t" |
547 "paddusw %%mm3, %%mm5 \n\t" | 547 "paddusw %%mm3, %%mm5 \n\t" |
548 "paddusw %%mm6, %%mm0 \n\t" | 548 "paddusw %%mm6, %%mm0 \n\t" |
549 "paddusw %%mm6, %%mm1 \n\t" | 549 "paddusw %%mm6, %%mm1 \n\t" |
550 "paddusw %%mm4, %%mm0 \n\t" | 550 "paddusw %%mm4, %%mm0 \n\t" |
551 "paddusw %%mm5, %%mm1 \n\t" | 551 "paddusw %%mm5, %%mm1 \n\t" |
552 "psrlw $2, %%mm0 \n\t" | 552 "psrlw $2, %%mm0 \n\t" |
553 "psrlw $2, %%mm1 \n\t" | 553 "psrlw $2, %%mm1 \n\t" |
554 "movq (%2, %%"REG_a"), %%mm3 \n\t" | 554 "movq (%2, %%"REG_a"), %%mm3 \n\t" |
555 "packuswb %%mm1, %%mm0 \n\t" | 555 "packuswb %%mm1, %%mm0 \n\t" |
556 "pcmpeqd %%mm2, %%mm2 \n\t" | 556 "pcmpeqd %%mm2, %%mm2 \n\t" |
557 "paddb %%mm2, %%mm2 \n\t" | 557 "paddb %%mm2, %%mm2 \n\t" |
558 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) | 558 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) |
559 "movq %%mm1, (%2, %%"REG_a") \n\t" | 559 "movq %%mm1, (%2, %%"REG_a") \n\t" |
560 "add %3, %%"REG_a" \n\t" | 560 "add %3, %%"REG_a" \n\t" |
561 | 561 |
562 "subl $2, %0 \n\t" | 562 "subl $2, %0 \n\t" |
563 "jnz 1b \n\t" | 563 "jnz 1b \n\t" |
564 :"+g"(h), "+S"(pixels) | 564 :"+g"(h), "+S"(pixels) |
565 :"D"(block), "r"((long)line_size) | 565 :"D"(block), "r"((long)line_size) |
566 :REG_a, "memory"); | 566 :REG_a, "memory"); |
567 } | 567 } |
568 | 568 |
569 //FIXME optimize | 569 //FIXME optimize |
570 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ | 570 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
571 DEF(put, pixels8_y2)(block , pixels , line_size, h); | 571 DEF(put, pixels8_y2)(block , pixels , line_size, h); |