comparison libpostproc/postprocess_template.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents 403183bbb505
children 0b546eab515d
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
56 56
/*
 * PMINUB(x, y, t): emit asm text that leaves the per-byte unsigned minimum of
 * x and y in y (the second macro argument).
 *  - HAVE_MMX2: a single pminub instruction; t is not used.
 *  - HAVE_MMX only: emulated as t = y; t = saturate(t - x); y = y - t, with t
 *    as a scratch register. This variant names its parameters (b,a,t), so its
 *    "a" is the SECOND argument, i.e. the same operand pminub would overwrite.
 * Arguments are stringified, so they are pasted into the asm text verbatim.
 */
57 #ifdef HAVE_MMX2 57 #ifdef HAVE_MMX2
58 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" 58 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
59 #elif defined (HAVE_MMX) 59 #elif defined (HAVE_MMX)
60 #define PMINUB(b,a,t) \ 60 #define PMINUB(b,a,t) \
61 "movq " #a ", " #t " \n\t"\ 61 "movq " #a ", " #t " \n\t"\
62 "psubusb " #b ", " #t " \n\t"\ 62 "psubusb " #b ", " #t " \n\t"\
63 "psubb " #t ", " #a " \n\t" 63 "psubb " #t ", " #a " \n\t"
64 #endif 64 #endif
65 65
/*
 * PMAXUB(a, b): emit asm text that leaves the per-byte unsigned maximum of
 * a and b in b (the second macro argument).
 *  - HAVE_MMX2: a single pmaxub instruction.
 *  - HAVE_MMX only: emulated as b = saturate(b - a); b = b + a, which needs
 *    no scratch register.
 */
66 #ifdef HAVE_MMX2 66 #ifdef HAVE_MMX2
67 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" 67 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
68 #elif defined (HAVE_MMX) 68 #elif defined (HAVE_MMX)
69 #define PMAXUB(a,b) \ 69 #define PMAXUB(a,b) \
70 "psubusb " #a ", " #b " \n\t"\ 70 "psubusb " #a ", " #b " \n\t"\
71 "paddb " #a ", " #b " \n\t" 71 "paddb " #a ", " #b " \n\t"
72 #endif 72 #endif
73 73
74 //FIXME? |255-0| = 1 (shouldn't be a problem ...) 74 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
75 #ifdef HAVE_MMX 75 #ifdef HAVE_MMX
76 /** 76 /**
77 * Check if the middle 8x8 Block in the given 8x16 block is flat 77 * Check if the middle 8x8 Block in the given 8x16 block is flat
78 */ 78 */
79 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 79 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
80 int numEq= 0, dcOk; 80 int numEq= 0, dcOk;
81 src+= stride*4; // src points to begin of the 8x8 Block 81 src+= stride*4; // src points to begin of the 8x8 Block
82 asm volatile( 82 asm volatile(
83 "movq %0, %%mm7 \n\t" 83 "movq %0, %%mm7 \n\t"
84 "movq %1, %%mm6 \n\t" 84 "movq %1, %%mm6 \n\t"
85 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 85 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
86 ); 86 );
87 87
88 asm volatile( 88 asm volatile(
89 "lea (%2, %3), %%"REG_a" \n\t" 89 "lea (%2, %3), %%"REG_a" \n\t"
90 // 0 1 2 3 4 5 6 7 8 9 90 // 0 1 2 3 4 5 6 7 8 9
91 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 91 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
92 92
93 "movq (%2), %%mm0 \n\t" 93 "movq (%2), %%mm0 \n\t"
94 "movq (%%"REG_a"), %%mm1 \n\t" 94 "movq (%%"REG_a"), %%mm1 \n\t"
95 "movq %%mm0, %%mm3 \n\t" 95 "movq %%mm0, %%mm3 \n\t"
96 "movq %%mm0, %%mm4 \n\t" 96 "movq %%mm0, %%mm4 \n\t"
97 PMAXUB(%%mm1, %%mm4) 97 PMAXUB(%%mm1, %%mm4)
98 PMINUB(%%mm1, %%mm3, %%mm5) 98 PMINUB(%%mm1, %%mm3, %%mm5)
99 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 99 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
100 "paddb %%mm7, %%mm0 \n\t" 100 "paddb %%mm7, %%mm0 \n\t"
101 "pcmpgtb %%mm6, %%mm0 \n\t" 101 "pcmpgtb %%mm6, %%mm0 \n\t"
102 102
103 "movq (%%"REG_a",%3), %%mm2 \n\t" 103 "movq (%%"REG_a",%3), %%mm2 \n\t"
104 PMAXUB(%%mm2, %%mm4) 104 PMAXUB(%%mm2, %%mm4)
105 PMINUB(%%mm2, %%mm3, %%mm5) 105 PMINUB(%%mm2, %%mm3, %%mm5)
106 "psubb %%mm2, %%mm1 \n\t" 106 "psubb %%mm2, %%mm1 \n\t"
107 "paddb %%mm7, %%mm1 \n\t" 107 "paddb %%mm7, %%mm1 \n\t"
108 "pcmpgtb %%mm6, %%mm1 \n\t" 108 "pcmpgtb %%mm6, %%mm1 \n\t"
109 "paddb %%mm1, %%mm0 \n\t" 109 "paddb %%mm1, %%mm0 \n\t"
110 110
111 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 111 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
112 PMAXUB(%%mm1, %%mm4) 112 PMAXUB(%%mm1, %%mm4)
113 PMINUB(%%mm1, %%mm3, %%mm5) 113 PMINUB(%%mm1, %%mm3, %%mm5)
114 "psubb %%mm1, %%mm2 \n\t" 114 "psubb %%mm1, %%mm2 \n\t"
115 "paddb %%mm7, %%mm2 \n\t" 115 "paddb %%mm7, %%mm2 \n\t"
116 "pcmpgtb %%mm6, %%mm2 \n\t" 116 "pcmpgtb %%mm6, %%mm2 \n\t"
117 "paddb %%mm2, %%mm0 \n\t" 117 "paddb %%mm2, %%mm0 \n\t"
118 118
119 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 119 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
120 120
121 "movq (%2, %3, 4), %%mm2 \n\t" 121 "movq (%2, %3, 4), %%mm2 \n\t"
122 PMAXUB(%%mm2, %%mm4) 122 PMAXUB(%%mm2, %%mm4)
123 PMINUB(%%mm2, %%mm3, %%mm5) 123 PMINUB(%%mm2, %%mm3, %%mm5)
124 "psubb %%mm2, %%mm1 \n\t" 124 "psubb %%mm2, %%mm1 \n\t"
125 "paddb %%mm7, %%mm1 \n\t" 125 "paddb %%mm7, %%mm1 \n\t"
126 "pcmpgtb %%mm6, %%mm1 \n\t" 126 "pcmpgtb %%mm6, %%mm1 \n\t"
127 "paddb %%mm1, %%mm0 \n\t" 127 "paddb %%mm1, %%mm0 \n\t"
128 128
129 "movq (%%"REG_a"), %%mm1 \n\t" 129 "movq (%%"REG_a"), %%mm1 \n\t"
130 PMAXUB(%%mm1, %%mm4) 130 PMAXUB(%%mm1, %%mm4)
131 PMINUB(%%mm1, %%mm3, %%mm5) 131 PMINUB(%%mm1, %%mm3, %%mm5)
132 "psubb %%mm1, %%mm2 \n\t" 132 "psubb %%mm1, %%mm2 \n\t"
133 "paddb %%mm7, %%mm2 \n\t" 133 "paddb %%mm7, %%mm2 \n\t"
134 "pcmpgtb %%mm6, %%mm2 \n\t" 134 "pcmpgtb %%mm6, %%mm2 \n\t"
135 "paddb %%mm2, %%mm0 \n\t" 135 "paddb %%mm2, %%mm0 \n\t"
136 136
137 "movq (%%"REG_a", %3), %%mm2 \n\t" 137 "movq (%%"REG_a", %3), %%mm2 \n\t"
138 PMAXUB(%%mm2, %%mm4) 138 PMAXUB(%%mm2, %%mm4)
139 PMINUB(%%mm2, %%mm3, %%mm5) 139 PMINUB(%%mm2, %%mm3, %%mm5)
140 "psubb %%mm2, %%mm1 \n\t" 140 "psubb %%mm2, %%mm1 \n\t"
141 "paddb %%mm7, %%mm1 \n\t" 141 "paddb %%mm7, %%mm1 \n\t"
142 "pcmpgtb %%mm6, %%mm1 \n\t" 142 "pcmpgtb %%mm6, %%mm1 \n\t"
143 "paddb %%mm1, %%mm0 \n\t" 143 "paddb %%mm1, %%mm0 \n\t"
144 144
145 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 145 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
146 PMAXUB(%%mm1, %%mm4) 146 PMAXUB(%%mm1, %%mm4)
147 PMINUB(%%mm1, %%mm3, %%mm5) 147 PMINUB(%%mm1, %%mm3, %%mm5)
148 "psubb %%mm1, %%mm2 \n\t" 148 "psubb %%mm1, %%mm2 \n\t"
149 "paddb %%mm7, %%mm2 \n\t" 149 "paddb %%mm7, %%mm2 \n\t"
150 "pcmpgtb %%mm6, %%mm2 \n\t" 150 "pcmpgtb %%mm6, %%mm2 \n\t"
151 "paddb %%mm2, %%mm0 \n\t" 151 "paddb %%mm2, %%mm0 \n\t"
152 "psubusb %%mm3, %%mm4 \n\t" 152 "psubusb %%mm3, %%mm4 \n\t"
153 153
154 " \n\t" 154 " \n\t"
155 #ifdef HAVE_MMX2 155 #ifdef HAVE_MMX2
156 "pxor %%mm7, %%mm7 \n\t" 156 "pxor %%mm7, %%mm7 \n\t"
157 "psadbw %%mm7, %%mm0 \n\t" 157 "psadbw %%mm7, %%mm0 \n\t"
158 #else 158 #else
159 "movq %%mm0, %%mm1 \n\t" 159 "movq %%mm0, %%mm1 \n\t"
160 "psrlw $8, %%mm0 \n\t" 160 "psrlw $8, %%mm0 \n\t"
161 "paddb %%mm1, %%mm0 \n\t" 161 "paddb %%mm1, %%mm0 \n\t"
162 "movq %%mm0, %%mm1 \n\t" 162 "movq %%mm0, %%mm1 \n\t"
163 "psrlq $16, %%mm0 \n\t" 163 "psrlq $16, %%mm0 \n\t"
164 "paddb %%mm1, %%mm0 \n\t" 164 "paddb %%mm1, %%mm0 \n\t"
165 "movq %%mm0, %%mm1 \n\t" 165 "movq %%mm0, %%mm1 \n\t"
166 "psrlq $32, %%mm0 \n\t" 166 "psrlq $32, %%mm0 \n\t"
167 "paddb %%mm1, %%mm0 \n\t" 167 "paddb %%mm1, %%mm0 \n\t"
168 #endif 168 #endif
169 "movq %4, %%mm7 \n\t" // QP,..., QP 169 "movq %4, %%mm7 \n\t" // QP,..., QP
170 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 170 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
171 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 171 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
172 "packssdw %%mm4, %%mm4 \n\t" 172 "packssdw %%mm4, %%mm4 \n\t"
173 "movd %%mm0, %0 \n\t" 173 "movd %%mm0, %0 \n\t"
174 "movd %%mm4, %1 \n\t" 174 "movd %%mm4, %1 \n\t"
175 175
176 : "=r" (numEq), "=r" (dcOk) 176 : "=r" (numEq), "=r" (dcOk)
177 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 177 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
178 : "%"REG_a 178 : "%"REG_a
179 ); 179 );
180 180
181 numEq= (-numEq) &0xFF; 181 numEq= (-numEq) &0xFF;
182 if(numEq > c->ppMode.flatnessThreshold){ 182 if(numEq > c->ppMode.flatnessThreshold){
183 if(dcOk) return 0; 183 if(dcOk) return 0;
184 else return 1; 184 else return 1;
185 }else{ 185 }else{
186 return 2; 186 return 2;
187 } 187 }
194 */ 194 */
195 #ifndef HAVE_ALTIVEC 195 #ifndef HAVE_ALTIVEC
/*
 * Vertical low-pass (smoothing) filter, applied in place.
 *
 * src is first advanced by 3*stride. Relative to that position, lines 1..8
 * are overwritten; lines 0 and 9 are only read, to choose the "first" and
 * "last" edge values.
 *
 * src:    start of the area; samples of one column are stride apart.
 * stride: distance between vertically adjacent samples.
 * c:      context; the asm path reads c->pQPb, the C path reads c->QP.
 *
 * The asm path handles 8 horizontally adjacent bytes per line (one movq);
 * the C path loops over BLOCK_SIZE columns.
 */
196 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 196 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
197 { 197 {
198 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 198 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
199 src+= stride*3; 199 src+= stride*3;
200 asm volatile( //"movv %0 %1 %2\n\t" 200 asm volatile( //"movv %0 %1 %2\n\t"
201 "movq %2, %%mm0 \n\t" // QP,..., QP 201 "movq %2, %%mm0 \n\t" // QP,..., QP
202 "pxor %%mm4, %%mm4 \n\t" 202 "pxor %%mm4, %%mm4 \n\t"
203 203
// mm6 = "first": per byte, line 0 where |line0 - line1| <= QP, otherwise line 1.
204 "movq (%0), %%mm6 \n\t" 204 "movq (%0), %%mm6 \n\t"
205 "movq (%0, %1), %%mm5 \n\t" 205 "movq (%0, %1), %%mm5 \n\t"
206 "movq %%mm5, %%mm1 \n\t" 206 "movq %%mm5, %%mm1 \n\t"
207 "movq %%mm6, %%mm2 \n\t" 207 "movq %%mm6, %%mm2 \n\t"
208 "psubusb %%mm6, %%mm5 \n\t" 208 "psubusb %%mm6, %%mm5 \n\t"
209 "psubusb %%mm1, %%mm2 \n\t" 209 "psubusb %%mm1, %%mm2 \n\t"
210 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 210 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
211 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 211 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
212 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 212 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
213 213
214 "pand %%mm2, %%mm6 \n\t" 214 "pand %%mm2, %%mm6 \n\t"
215 "pandn %%mm1, %%mm2 \n\t" 215 "pandn %%mm1, %%mm2 \n\t"
216 "por %%mm2, %%mm6 \n\t"// First Line to Filter 216 "por %%mm2, %%mm6 \n\t"// First Line to Filter
217 217
// mm5 = line 8 (loaded before %0 is bumped). REG_a = line 4, REG_c = line 7
// (8*stride minus one stride), both relative to the not-yet-bumped %0.
218 "movq (%0, %1, 8), %%mm5 \n\t" 218 "movq (%0, %1, 8), %%mm5 \n\t"
219 "lea (%0, %1, 4), %%"REG_a" \n\t" 219 "lea (%0, %1, 4), %%"REG_a" \n\t"
220 "lea (%0, %1, 8), %%"REG_c" \n\t" 220 "lea (%0, %1, 8), %%"REG_c" \n\t"
221 "sub %1, %%"REG_c" \n\t" 221 "sub %1, %%"REG_c" \n\t"
// NOTE(review): %0 is declared as an input-only operand but is modified here;
// the matching "sub %1, %0" at the end of the asm restores it.
222 "add %1, %0 \n\t" // %0 points to line 1 not 0 222 "add %1, %0 \n\t" // %0 points to line 1 not 0
// mm7 = "last": per byte, line 9 where |line8 - line9| <= QP, otherwise line 8
// (the "First Line to Filter" remark on the por below is a leftover from the block above).
223 "movq (%0, %1, 8), %%mm7 \n\t" 223 "movq (%0, %1, 8), %%mm7 \n\t"
224 "movq %%mm5, %%mm1 \n\t" 224 "movq %%mm5, %%mm1 \n\t"
225 "movq %%mm7, %%mm2 \n\t" 225 "movq %%mm7, %%mm2 \n\t"
226 "psubusb %%mm7, %%mm5 \n\t" 226 "psubusb %%mm7, %%mm5 \n\t"
227 "psubusb %%mm1, %%mm2 \n\t" 227 "psubusb %%mm1, %%mm2 \n\t"
228 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 228 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
229 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 229 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
230 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 230 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
231 231
232 "pand %%mm2, %%mm7 \n\t" 232 "pand %%mm2, %%mm7 \n\t"
233 "pandn %%mm1, %%mm2 \n\t" 233 "pandn %%mm1, %%mm2 \n\t"
234 "por %%mm2, %%mm7 \n\t" // First Line to Filter 234 "por %%mm2, %%mm7 \n\t" // First Line to Filter
235 235
236 236
237 // 1 2 3 4 5 6 7 8 237 // 1 2 3 4 5 6 7 8
238 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 238 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
239 // 6 4 2 2 1 1 239 // 6 4 2 2 1 1
240 // 6 4 4 2 240 // 6 4 4 2
241 // 6 8 2 241 // 6 8 2
242 242
// From here on the weighted sums are built from chained PAVGB averages; the trailing
// comments track the accumulated weights and the "X" marks each store of a finished line.
243 "movq (%0, %1), %%mm0 \n\t" // 1 243 "movq (%0, %1), %%mm0 \n\t" // 1
244 "movq %%mm0, %%mm1 \n\t" // 1 244 "movq %%mm0, %%mm1 \n\t" // 1
245 PAVGB(%%mm6, %%mm0) //1 1 /2 245 PAVGB(%%mm6, %%mm0) //1 1 /2
246 PAVGB(%%mm6, %%mm0) //3 1 /4 246 PAVGB(%%mm6, %%mm0) //3 1 /4
247 247
248 "movq (%0, %1, 4), %%mm2 \n\t" // 1 248 "movq (%0, %1, 4), %%mm2 \n\t" // 1
249 "movq %%mm2, %%mm5 \n\t" // 1 249 "movq %%mm2, %%mm5 \n\t" // 1
250 PAVGB((%%REGa), %%mm2) // 11 /2 250 PAVGB((%%REGa), %%mm2) // 11 /2
251 PAVGB((%0, %1, 2), %%mm2) // 211 /4 251 PAVGB((%0, %1, 2), %%mm2) // 211 /4
252 "movq %%mm2, %%mm3 \n\t" // 211 /4 252 "movq %%mm2, %%mm3 \n\t" // 211 /4
253 "movq (%0), %%mm4 \n\t" // 1 253 "movq (%0), %%mm4 \n\t" // 1
254 PAVGB(%%mm4, %%mm3) // 4 211 /8 254 PAVGB(%%mm4, %%mm3) // 4 211 /8
255 PAVGB(%%mm0, %%mm3) //642211 /16 255 PAVGB(%%mm0, %%mm3) //642211 /16
256 "movq %%mm3, (%0) \n\t" // X 256 "movq %%mm3, (%0) \n\t" // X
257 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 257 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
258 "movq %%mm1, %%mm0 \n\t" // 1 258 "movq %%mm1, %%mm0 \n\t" // 1
259 PAVGB(%%mm6, %%mm0) //1 1 /2 259 PAVGB(%%mm6, %%mm0) //1 1 /2
260 "movq %%mm4, %%mm3 \n\t" // 1 260 "movq %%mm4, %%mm3 \n\t" // 1
261 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 261 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
262 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 262 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
263 PAVGB((%%REGa), %%mm5) // 211 /4 263 PAVGB((%%REGa), %%mm5) // 211 /4
264 PAVGB(%%mm5, %%mm3) // 2 2211 /8 264 PAVGB(%%mm5, %%mm3) // 2 2211 /8
265 PAVGB(%%mm0, %%mm3) //4242211 /16 265 PAVGB(%%mm0, %%mm3) //4242211 /16
266 "movq %%mm3, (%0,%1) \n\t" // X 266 "movq %%mm3, (%0,%1) \n\t" // X
267 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 267 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
268 PAVGB(%%mm4, %%mm6) //11 /2 268 PAVGB(%%mm4, %%mm6) //11 /2
269 "movq (%%"REG_c"), %%mm0 \n\t" // 1 269 "movq (%%"REG_c"), %%mm0 \n\t" // 1
270 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 270 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
271 "movq %%mm0, %%mm3 \n\t" // 11/2 271 "movq %%mm0, %%mm3 \n\t" // 11/2
272 PAVGB(%%mm1, %%mm0) // 2 11/4 272 PAVGB(%%mm1, %%mm0) // 2 11/4
273 PAVGB(%%mm6, %%mm0) //222 11/8 273 PAVGB(%%mm6, %%mm0) //222 11/8
274 PAVGB(%%mm2, %%mm0) //22242211/16 274 PAVGB(%%mm2, %%mm0) //22242211/16
275 "movq (%0, %1, 2), %%mm2 \n\t" // 1 275 "movq (%0, %1, 2), %%mm2 \n\t" // 1
276 "movq %%mm0, (%0, %1, 2) \n\t" // X 276 "movq %%mm0, (%0, %1, 2) \n\t" // X
277 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 277 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
278 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 278 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
279 PAVGB((%%REGc), %%mm0) // 11 /2 279 PAVGB((%%REGc), %%mm0) // 11 /2
280 PAVGB(%%mm0, %%mm6) //11 11 /4 280 PAVGB(%%mm0, %%mm6) //11 11 /4
281 PAVGB(%%mm1, %%mm4) // 11 /2 281 PAVGB(%%mm1, %%mm4) // 11 /2
282 PAVGB(%%mm2, %%mm1) // 11 /2 282 PAVGB(%%mm2, %%mm1) // 11 /2
283 PAVGB(%%mm1, %%mm6) //1122 11 /8 283 PAVGB(%%mm1, %%mm6) //1122 11 /8
284 PAVGB(%%mm5, %%mm6) //112242211 /16 284 PAVGB(%%mm5, %%mm6) //112242211 /16
285 "movq (%%"REG_a"), %%mm5 \n\t" // 1 285 "movq (%%"REG_a"), %%mm5 \n\t" // 1
286 "movq %%mm6, (%%"REG_a") \n\t" // X 286 "movq %%mm6, (%%"REG_a") \n\t" // X
287 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 287 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
288 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 288 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
289 PAVGB(%%mm7, %%mm6) // 11 /2 289 PAVGB(%%mm7, %%mm6) // 11 /2
290 PAVGB(%%mm4, %%mm6) // 11 11 /4 290 PAVGB(%%mm4, %%mm6) // 11 11 /4
291 PAVGB(%%mm3, %%mm6) // 11 2211 /8 291 PAVGB(%%mm3, %%mm6) // 11 2211 /8
292 PAVGB(%%mm5, %%mm2) // 11 /2 292 PAVGB(%%mm5, %%mm2) // 11 /2
293 "movq (%0, %1, 4), %%mm4 \n\t" // 1 293 "movq (%0, %1, 4), %%mm4 \n\t" // 1
294 PAVGB(%%mm4, %%mm2) // 112 /4 294 PAVGB(%%mm4, %%mm2) // 112 /4
295 PAVGB(%%mm2, %%mm6) // 112242211 /16 295 PAVGB(%%mm2, %%mm6) // 112242211 /16
296 "movq %%mm6, (%0, %1, 4) \n\t" // X 296 "movq %%mm6, (%0, %1, 4) \n\t" // X
297 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 297 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
298 PAVGB(%%mm7, %%mm1) // 11 2 /4 298 PAVGB(%%mm7, %%mm1) // 11 2 /4
299 PAVGB(%%mm4, %%mm5) // 11 /2 299 PAVGB(%%mm4, %%mm5) // 11 /2
300 PAVGB(%%mm5, %%mm0) // 11 11 /4 300 PAVGB(%%mm5, %%mm0) // 11 11 /4
301 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 301 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
302 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 302 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
303 PAVGB(%%mm0, %%mm1) // 11224222 /16 303 PAVGB(%%mm0, %%mm1) // 11224222 /16
304 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 304 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
305 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 305 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
306 PAVGB((%%REGc), %%mm2) // 112 4 /8 306 PAVGB((%%REGc), %%mm2) // 112 4 /8
307 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 307 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
308 PAVGB(%%mm0, %%mm6) // 1 1 /2 308 PAVGB(%%mm0, %%mm6) // 1 1 /2
309 PAVGB(%%mm7, %%mm6) // 1 12 /4 309 PAVGB(%%mm7, %%mm6) // 1 12 /4
310 PAVGB(%%mm2, %%mm6) // 1122424 /4 310 PAVGB(%%mm2, %%mm6) // 1122424 /4
311 "movq %%mm6, (%%"REG_c") \n\t" // X 311 "movq %%mm6, (%%"REG_c") \n\t" // X
312 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 312 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
313 PAVGB(%%mm7, %%mm5) // 11 2 /4 313 PAVGB(%%mm7, %%mm5) // 11 2 /4
314 PAVGB(%%mm7, %%mm5) // 11 6 /8 314 PAVGB(%%mm7, %%mm5) // 11 6 /8
315 315
316 PAVGB(%%mm3, %%mm0) // 112 /4 316 PAVGB(%%mm3, %%mm0) // 112 /4
317 PAVGB(%%mm0, %%mm5) // 112246 /16 317 PAVGB(%%mm0, %%mm5) // 112246 /16
318 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 318 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
// Undo the earlier "add %1, %0" so the operand register holds its original value again.
319 "sub %1, %0 \n\t" 319 "sub %1, %0 \n\t"
320 320
321 : 321 :
322 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 322 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
323 : "%"REG_a, "%"REG_c 323 : "%"REG_a, "%"REG_c
324 ); 324 );
325 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 325 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Portable C path. l1..l9 are the offsets of lines 1..9 below the advanced src.
326 const int l1= stride; 326 const int l1= stride;
327 const int l2= stride + l1; 327 const int l2= stride + l1;
328 const int l3= stride + l2; 328 const int l3= stride + l2;
329 const int l4= stride + l3; 329 const int l4= stride + l3;
330 const int l5= stride + l4; 330 const int l5= stride + l4;
331 const int l6= stride + l5; 331 const int l6= stride + l5;
332 const int l7= stride + l6; 332 const int l7= stride + l6;
333 const int l8= stride + l7; 333 const int l8= stride + l7;
334 const int l9= stride + l8; 334 const int l9= stride + l8;
335 int x; 335 int x;
336 src+= stride*3; 336 src+= stride*3;
337 for(x=0; x<BLOCK_SIZE; x++) 337 for(x=0; x<BLOCK_SIZE; x++)
338 { 338 {
// Edge values used in place of the lines above line 1 / below line 8.
// NOTE(review): the comparison here is a strict '<' against c->QP, whereas the asm
// path's mask is set for diff <= the pQPb bytes -- confirm the difference is intended.
339 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; 339 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
340 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; 340 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
341 341
// sums[k] = sum of the 7 samples of lines k-3..k+3, with lines above 1 replaced by
// 'first' and lines below 8 by 'last', plus 4. Each entry is derived from the previous
// one by dropping the oldest sample and adding the next.
342 int sums[10]; 342 int sums[10];
343 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; 343 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
344 sums[1] = sums[0] - first + src[l4]; 344 sums[1] = sums[0] - first + src[l4];
345 sums[2] = sums[1] - first + src[l5]; 345 sums[2] = sums[1] - first + src[l5];
346 sums[3] = sums[2] - first + src[l6]; 346 sums[3] = sums[2] - first + src[l6];
347 sums[4] = sums[3] - first + src[l7]; 347 sums[4] = sums[3] - first + src[l7];
348 sums[5] = sums[4] - src[l1] + src[l8]; 348 sums[5] = sums[4] - src[l1] + src[l8];
349 sums[6] = sums[5] - src[l2] + last; 349 sums[6] = sums[5] - src[l2] + last;
350 sums[7] = sums[6] - src[l3] + last; 350 sums[7] = sums[6] - src[l3] + last;
351 sums[8] = sums[7] - src[l4] + last; 351 sums[8] = sums[7] - src[l4] + last;
352 sums[9] = sums[8] - src[l5] + last; 352 sums[9] = sums[8] - src[l5] + last;
353 353
// Output line i = (sums[i-1] + sums[i+1] + 2*line i) >> 4. The two +4 terms carried
// by the sums add up to the +8 rounding offset for the shift by 4. All sums were
// taken before any store, so every output is computed from unfiltered samples.
354 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 354 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
355 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 355 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
356 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 356 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
357 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 357 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
358 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 358 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
359 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 359 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
360 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 360 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
361 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 361 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
362 362
// Step to the next column.
363 src++; 363 src++;
364 } 364 }
365 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 365 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
366 } 366 }
367 #endif //HAVE_ALTIVEC 367 #endif //HAVE_ALTIVEC
368 368
369 #if 0 369 #if 0
370 /** 370 /**
371 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar 371 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
372 * values are correctly clipped (MMX2) 372 * values are correctly clipped (MMX2)
373 * values are wraparound (C) 373 * values are wraparound (C)
374 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient 374 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
375 0 8 16 24 375 0 8 16 24
376 x = 8 376 x = 8
377 x/2 = 4 377 x/2 = 4
378 x/8 = 1 378 x/8 = 1
379 1 12 12 23 379 1 12 12 23
380 */ 380 */
/*
 * src:    start of the area to filter; advanced by 3*stride inside the function.
 * stride: distance between vertically adjacent samples.
 * QP:     threshold basis. Only the C path reads this parameter; the asm path
 *         loads MANGLE(pQPb) instead.
 *
 * Both paths modify only lines 3..6 (relative to the advanced src), in place.
 * The asm path handles 8 adjacent bytes per line; the C path loops over
 * BLOCK_SIZE columns.
 */
381 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) 381 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
382 { 382 {
383 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 383 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
384 src+= stride*3; 384 src+= stride*3;
385 // FIXME rounding 385 // FIXME rounding
386 asm volatile( 386 asm volatile(
387 "pxor %%mm7, %%mm7 \n\t" // 0 387 "pxor %%mm7, %%mm7 \n\t" // 0
388 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE 388 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
// NOTE(review): "leal" carries a 32-bit operand-size suffix, while the other functions
// in this file use plain "lea" with REG_a/REG_c. Presumably this would not assemble
// where REG_a names a 64-bit register -- confirm before re-enabling this code.
389 "leal (%0, %1), %%"REG_a" \n\t" 389 "leal (%0, %1), %%"REG_a" \n\t"
390 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" 390 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t"
391 // 0 1 2 3 4 5 6 7 8 9 391 // 0 1 2 3 4 5 6 7 8 9
392 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 392 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
// The threshold comes from the pQPb symbol, not from the QP parameter of this function.
393 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 393 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
394 "movq %%mm0, %%mm1 \n\t" // QP,..., QP 394 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
395 "paddusb "MANGLE(b02)", %%mm0 \n\t" 395 "paddusb "MANGLE(b02)", %%mm0 \n\t"
396 "psrlw $2, %%mm0 \n\t" 396 "psrlw $2, %%mm0 \n\t"
397 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 397 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
398 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... 398 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
399 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 399 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
400 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 400 "movq (%%"REG_c"), %%mm3 \n\t" // line 5
401 "movq %%mm2, %%mm4 \n\t" // line 4 401 "movq %%mm2, %%mm4 \n\t" // line 4
402 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 402 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
403 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 403 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
404 PAVGB(%%mm3, %%mm5) 404 PAVGB(%%mm3, %%mm5)
405 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 405 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
406 "psubusb %%mm3, %%mm4 \n\t" 406 "psubusb %%mm3, %%mm4 \n\t"
407 "psubusb %%mm2, %%mm3 \n\t" 407 "psubusb %%mm2, %%mm3 \n\t"
408 "por %%mm3, %%mm4 \n\t" // |l4 - l5| 408 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
// mm4 becomes a byte mask: FF where |l4 - l5| does not exceed the threshold in mm0.
409 "psubusb %%mm0, %%mm4 \n\t" 409 "psubusb %%mm0, %%mm4 \n\t"
410 "pcmpeqb %%mm7, %%mm4 \n\t" 410 "pcmpeqb %%mm7, %%mm4 \n\t"
411 "pand %%mm4, %%mm5 \n\t" // d/2 411 "pand %%mm4, %%mm5 \n\t" // d/2
412 412
413 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 413 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
414 "paddb %%mm5, %%mm2 \n\t" 414 "paddb %%mm5, %%mm2 \n\t"
415 // "psubb %%mm6, %%mm2 \n\t" 415 // "psubb %%mm6, %%mm2 \n\t"
416 "movq %%mm2, (%0,%1, 4) \n\t" 416 "movq %%mm2, (%0,%1, 4) \n\t"
417 417
418 "movq (%%"REG_c"), %%mm2 \n\t" 418 "movq (%%"REG_c"), %%mm2 \n\t"
419 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 419 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
420 "psubb %%mm5, %%mm2 \n\t" 420 "psubb %%mm5, %%mm2 \n\t"
421 // "psubb %%mm6, %%mm2 \n\t" 421 // "psubb %%mm6, %%mm2 \n\t"
422 "movq %%mm2, (%%"REG_c") \n\t" 422 "movq %%mm2, (%%"REG_c") \n\t"
423 423
424 "paddb %%mm6, %%mm5 \n\t" 424 "paddb %%mm6, %%mm5 \n\t"
425 "psrlw $2, %%mm5 \n\t" 425 "psrlw $2, %%mm5 \n\t"
426 "pand "MANGLE(b3F)", %%mm5 \n\t" 426 "pand "MANGLE(b3F)", %%mm5 \n\t"
427 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 427 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
428 428
// Lines 3 and 6 are biased by 0x80 (mm6) so that the signed saturating paddsb/psubsb
// can be applied to them, then biased back before the store.
429 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" 429 "movq (%%"REG_a", %1, 2), %%mm2 \n\t"
430 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 430 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
431 "paddsb %%mm5, %%mm2 \n\t" 431 "paddsb %%mm5, %%mm2 \n\t"
432 "psubb %%mm6, %%mm2 \n\t" 432 "psubb %%mm6, %%mm2 \n\t"
433 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 433 "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
434 434
435 "movq (%%"REG_c", %1), %%mm2 \n\t" 435 "movq (%%"REG_c", %1), %%mm2 \n\t"
436 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 436 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
437 "psubsb %%mm5, %%mm2 \n\t" 437 "psubsb %%mm5, %%mm2 \n\t"
438 "psubb %%mm6, %%mm2 \n\t" 438 "psubb %%mm6, %%mm2 \n\t"
439 "movq %%mm2, (%%"REG_c", %1) \n\t" 439 "movq %%mm2, (%%"REG_c", %1) \n\t"
440 440
441 : 441 :
442 : "r" (src), "r" ((long)stride) 442 : "r" (src), "r" ((long)stride)
443 : "%"REG_a, "%"REG_c 443 : "%"REG_a, "%"REG_c
444 ); 444 );
445 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 445 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Portable C path. l1..l6 are the offsets of lines 1..6 below the advanced src.
446 const int l1= stride; 446 const int l1= stride;
447 const int l2= stride + l1; 447 const int l2= stride + l1;
448 const int l3= stride + l2; 448 const int l3= stride + l2;
449 const int l4= stride + l3; 449 const int l4= stride + l3;
450 const int l5= stride + l4; 450 const int l5= stride + l4;
451 const int l6= stride + l5; 451 const int l6= stride + l5;
452 // const int l7= stride + l6; 452 // const int l7= stride + l6;
453 // const int l8= stride + l7; 453 // const int l8= stride + l7;
454 // const int l9= stride + l8; 454 // const int l9= stride + l8;
455 int x; 455 int x;
// Despite its name, QP15 is QP + QP/4, i.e. 1.25*QP (the asm path also builds "QP*1.25").
456 const int QP15= QP + (QP>>2); 456 const int QP15= QP + (QP>>2);
457 src+= stride*3; 457 src+= stride*3;
// Columns are addressed as src[x + offset]; src itself is not advanced in this loop.
458 for(x=0; x<BLOCK_SIZE; x++) 458 for(x=0; x<BLOCK_SIZE; x++)
459 { 459 {
// v = step between lines 4 and 5; only steps smaller than QP15 are smoothed.
460 const int v = (src[x+l5] - src[x+l4]); 460 const int v = (src[x+l5] - src[x+l4]);
461 if(ABS(v) < QP15) 461 if(ABS(v) < QP15)
462 { 462 {
// Move lines 4/5 towards each other by v/2 and lines 3/6 by v/8. The results are
// stored back without clamping (the comment block above says the C values wrap around).
463 src[x+l3] +=v>>3; 463 src[x+l3] +=v>>3;
464 src[x+l4] +=v>>1; 464 src[x+l4] +=v>>1;
465 src[x+l5] -=v>>1; 465 src[x+l5] -=v>>1;
466 src[x+l6] -=v>>3; 466 src[x+l6] -=v>>3;
467 467
468 } 468 }
469 } 469 }
470 470
471 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 471 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
472 } 472 }
473 #endif //0 473 #endif //0
474 474
480 * MMX2 version does correct clipping, C version doesn't 480 * MMX2 version does correct clipping, C version doesn't
481 */ 481 */
482 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 482 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
483 { 483 {
484 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 484 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
485 src+= stride*3; 485 src+= stride*3;
486 486
487 asm volatile( 487 asm volatile(
488 "pxor %%mm7, %%mm7 \n\t" // 0 488 "pxor %%mm7, %%mm7 \n\t" // 0
489 "lea (%0, %1), %%"REG_a" \n\t" 489 "lea (%0, %1), %%"REG_a" \n\t"
490 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 490 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
491 // 0 1 2 3 4 5 6 7 8 9 491 // 0 1 2 3 4 5 6 7 8 9
492 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 492 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
493 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 493 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
494 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 494 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
495 "movq %%mm1, %%mm2 \n\t" // line 4 495 "movq %%mm1, %%mm2 \n\t" // line 4
496 "psubusb %%mm0, %%mm1 \n\t" 496 "psubusb %%mm0, %%mm1 \n\t"
497 "psubusb %%mm2, %%mm0 \n\t" 497 "psubusb %%mm2, %%mm0 \n\t"
498 "por %%mm1, %%mm0 \n\t" // |l2 - l3| 498 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
499 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 499 "movq (%%"REG_c"), %%mm3 \n\t" // line 5
500 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 500 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
501 "movq %%mm3, %%mm5 \n\t" // line 5 501 "movq %%mm3, %%mm5 \n\t" // line 5
502 "psubusb %%mm4, %%mm3 \n\t" 502 "psubusb %%mm4, %%mm3 \n\t"
503 "psubusb %%mm5, %%mm4 \n\t" 503 "psubusb %%mm5, %%mm4 \n\t"
504 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 504 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
505 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 505 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
506 "movq %%mm2, %%mm1 \n\t" // line 4 506 "movq %%mm2, %%mm1 \n\t" // line 4
507 "psubusb %%mm5, %%mm2 \n\t" 507 "psubusb %%mm5, %%mm2 \n\t"
508 "movq %%mm2, %%mm4 \n\t" 508 "movq %%mm2, %%mm4 \n\t"
509 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 509 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
510 "psubusb %%mm1, %%mm5 \n\t" 510 "psubusb %%mm1, %%mm5 \n\t"
511 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 511 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
512 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 512 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
513 "movq %%mm4, %%mm3 \n\t" // d 513 "movq %%mm4, %%mm3 \n\t" // d
514 "movq %2, %%mm0 \n\t" 514 "movq %2, %%mm0 \n\t"
515 "paddusb %%mm0, %%mm0 \n\t" 515 "paddusb %%mm0, %%mm0 \n\t"
516 "psubusb %%mm0, %%mm4 \n\t" 516 "psubusb %%mm0, %%mm4 \n\t"
517 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 517 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
518 "psubusb "MANGLE(b01)", %%mm3 \n\t" 518 "psubusb "MANGLE(b01)", %%mm3 \n\t"
519 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 519 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
520 520
521 PAVGB(%%mm7, %%mm3) // d/2 521 PAVGB(%%mm7, %%mm3) // d/2
522 "movq %%mm3, %%mm1 \n\t" // d/2 522 "movq %%mm3, %%mm1 \n\t" // d/2
523 PAVGB(%%mm7, %%mm3) // d/4 523 PAVGB(%%mm7, %%mm3) // d/4
524 PAVGB(%%mm1, %%mm3) // 3*d/8 524 PAVGB(%%mm1, %%mm3) // 3*d/8
525 525
526 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 526 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
527 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 527 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
528 "psubusb %%mm3, %%mm0 \n\t" 528 "psubusb %%mm3, %%mm0 \n\t"
529 "pxor %%mm2, %%mm0 \n\t" 529 "pxor %%mm2, %%mm0 \n\t"
530 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 530 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
531 531
532 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 532 "movq (%%"REG_c"), %%mm0 \n\t" // line 5
533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
534 "paddusb %%mm3, %%mm0 \n\t" 534 "paddusb %%mm3, %%mm0 \n\t"
535 "pxor %%mm2, %%mm0 \n\t" 535 "pxor %%mm2, %%mm0 \n\t"
536 "movq %%mm0, (%%"REG_c") \n\t" // line 5 536 "movq %%mm0, (%%"REG_c") \n\t" // line 5
537 537
538 PAVGB(%%mm7, %%mm1) // d/4 538 PAVGB(%%mm7, %%mm1) // d/4
539 539
540 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 540 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
541 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 541 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
542 "psubusb %%mm1, %%mm0 \n\t" 542 "psubusb %%mm1, %%mm0 \n\t"
543 "pxor %%mm2, %%mm0 \n\t" 543 "pxor %%mm2, %%mm0 \n\t"
544 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 544 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3
545 545
546 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 546 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6
547 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 547 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
548 "paddusb %%mm1, %%mm0 \n\t" 548 "paddusb %%mm1, %%mm0 \n\t"
549 "pxor %%mm2, %%mm0 \n\t" 549 "pxor %%mm2, %%mm0 \n\t"
550 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 550 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6
551 551
552 PAVGB(%%mm7, %%mm1) // d/8 552 PAVGB(%%mm7, %%mm1) // d/8
553 553
554 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 554 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2
555 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 555 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
556 "psubusb %%mm1, %%mm0 \n\t" 556 "psubusb %%mm1, %%mm0 \n\t"
557 "pxor %%mm2, %%mm0 \n\t" 557 "pxor %%mm2, %%mm0 \n\t"
558 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 558 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2
559 559
560 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 560 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7
561 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 561 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
562 "paddusb %%mm1, %%mm0 \n\t" 562 "paddusb %%mm1, %%mm0 \n\t"
563 "pxor %%mm2, %%mm0 \n\t" 563 "pxor %%mm2, %%mm0 \n\t"
564 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 564 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7
565 565
566 : 566 :
567 : "r" (src), "r" ((long)stride), "m" (co->pQPb) 567 : "r" (src), "r" ((long)stride), "m" (co->pQPb)
568 : "%"REG_a, "%"REG_c 568 : "%"REG_a, "%"REG_c
569 ); 569 );
570 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 570 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
571 571
572 const int l1= stride; 572 const int l1= stride;
573 const int l2= stride + l1; 573 const int l2= stride + l1;
574 const int l3= stride + l2; 574 const int l3= stride + l2;
575 const int l4= stride + l3; 575 const int l4= stride + l3;
576 const int l5= stride + l4; 576 const int l5= stride + l4;
577 const int l6= stride + l5; 577 const int l6= stride + l5;
578 const int l7= stride + l6; 578 const int l7= stride + l6;
579 // const int l8= stride + l7; 579 // const int l8= stride + l7;
580 // const int l9= stride + l8; 580 // const int l9= stride + l8;
581 int x; 581 int x;
582 582
583 src+= stride*3; 583 src+= stride*3;
584 for(x=0; x<BLOCK_SIZE; x++) 584 for(x=0; x<BLOCK_SIZE; x++)
585 { 585 {
586 int a= src[l3] - src[l4]; 586 int a= src[l3] - src[l4];
587 int b= src[l4] - src[l5]; 587 int b= src[l4] - src[l5];
588 int c= src[l5] - src[l6]; 588 int c= src[l5] - src[l6];
589 589
590 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); 590 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
591 d= MAX(d, 0); 591 d= MAX(d, 0);
592 592
593 if(d < co->QP*2) 593 if(d < co->QP*2)
594 { 594 {
595 int v = d * SIGN(-b); 595 int v = d * SIGN(-b);
596 596
597 src[l2] +=v>>3; 597 src[l2] +=v>>3;
598 src[l3] +=v>>2; 598 src[l3] +=v>>2;
599 src[l4] +=(3*v)>>3; 599 src[l4] +=(3*v)>>3;
600 src[l5] -=(3*v)>>3; 600 src[l5] -=(3*v)>>3;
601 src[l6] -=v>>2; 601 src[l6] -=v>>2;
602 src[l7] -=v>>3; 602 src[l7] -=v>>3;
603 603
604 } 604 }
605 src++; 605 src++;
606 } 606 }
607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
608 } 608 }
609 609
610 #ifndef HAVE_ALTIVEC 610 #ifndef HAVE_ALTIVEC
611 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 611 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
612 { 612 {
613 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 613 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
614 /* 614 /*
615 uint8_t tmp[16]; 615 uint8_t tmp[16];
616 const int l1= stride; 616 const int l1= stride;
617 const int l2= stride + l1; 617 const int l2= stride + l1;
618 const int l3= stride + l2; 618 const int l3= stride + l2;
619 const int l4= (int)tmp - (int)src - stride*3; 619 const int l4= (int)tmp - (int)src - stride*3;
620 const int l5= (int)tmp - (int)src - stride*3 + 8; 620 const int l5= (int)tmp - (int)src - stride*3 + 8;
621 const int l6= stride*3 + l3; 621 const int l6= stride*3 + l3;
622 const int l7= stride + l6; 622 const int l7= stride + l6;
623 const int l8= stride + l7; 623 const int l8= stride + l7;
624 624
625 memcpy(tmp, src+stride*7, 8); 625 memcpy(tmp, src+stride*7, 8);
626 memcpy(tmp+8, src+stride*8, 8); 626 memcpy(tmp+8, src+stride*8, 8);
627 */ 627 */
628 src+= stride*4; 628 src+= stride*4;
629 asm volatile( 629 asm volatile(
630 630
631 #if 0 //slightly more accurate and slightly slower 631 #if 0 //slightly more accurate and slightly slower
632 "pxor %%mm7, %%mm7 \n\t" // 0 632 "pxor %%mm7, %%mm7 \n\t" // 0
633 "lea (%0, %1), %%"REG_a" \n\t" 633 "lea (%0, %1), %%"REG_a" \n\t"
634 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 634 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
635 // 0 1 2 3 4 5 6 7 635 // 0 1 2 3 4 5 6 7
636 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 636 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
637 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 637 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
638 638
639 639
640 "movq (%0, %1, 2), %%mm0 \n\t" // l2 640 "movq (%0, %1, 2), %%mm0 \n\t" // l2
641 "movq (%0), %%mm1 \n\t" // l0 641 "movq (%0), %%mm1 \n\t" // l0
642 "movq %%mm0, %%mm2 \n\t" // l2 642 "movq %%mm0, %%mm2 \n\t" // l2
643 PAVGB(%%mm7, %%mm0) // ~l2/2 643 PAVGB(%%mm7, %%mm0) // ~l2/2
644 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 644 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
645 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 645 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
646 646
647 "movq (%%"REG_a"), %%mm1 \n\t" // l1 647 "movq (%%"REG_a"), %%mm1 \n\t" // l1
648 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 648 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3
649 "movq %%mm1, %%mm4 \n\t" // l1 649 "movq %%mm1, %%mm4 \n\t" // l1
650 PAVGB(%%mm7, %%mm1) // ~l1/2 650 PAVGB(%%mm7, %%mm1) // ~l1/2
651 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 651 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
652 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 652 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
653 653
654 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 654 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
655 "psubusb %%mm1, %%mm0 \n\t" 655 "psubusb %%mm1, %%mm0 \n\t"
656 "psubusb %%mm4, %%mm1 \n\t" 656 "psubusb %%mm4, %%mm1 \n\t"
657 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 657 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
658 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 658 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
659 659
660 "movq (%0, %1, 4), %%mm0 \n\t" // l4 660 "movq (%0, %1, 4), %%mm0 \n\t" // l4
661 "movq %%mm0, %%mm4 \n\t" // l4 661 "movq %%mm0, %%mm4 \n\t" // l4
662 PAVGB(%%mm7, %%mm0) // ~l4/2 662 PAVGB(%%mm7, %%mm0) // ~l4/2
663 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 663 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
664 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 664 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
665 665
666 "movq (%%"REG_c"), %%mm2 \n\t" // l5 666 "movq (%%"REG_c"), %%mm2 \n\t" // l5
667 "movq %%mm3, %%mm5 \n\t" // l3 667 "movq %%mm3, %%mm5 \n\t" // l3
668 PAVGB(%%mm7, %%mm3) // ~l3/2 668 PAVGB(%%mm7, %%mm3) // ~l3/2
669 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 669 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
670 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 670 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
671 671
672 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 672 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
673 "psubusb %%mm3, %%mm0 \n\t" 673 "psubusb %%mm3, %%mm0 \n\t"
674 "psubusb %%mm6, %%mm3 \n\t" 674 "psubusb %%mm6, %%mm3 \n\t"
675 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 675 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
676 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 676 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
677 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 677 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
678 678
679 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 679 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6
680 "movq %%mm6, %%mm5 \n\t" // l6 680 "movq %%mm6, %%mm5 \n\t" // l6
681 PAVGB(%%mm7, %%mm6) // ~l6/2 681 PAVGB(%%mm7, %%mm6) // ~l6/2
682 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 682 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
683 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 683 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
684 684
685 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 685 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7
686 "movq %%mm2, %%mm4 \n\t" // l5 686 "movq %%mm2, %%mm4 \n\t" // l5
687 PAVGB(%%mm7, %%mm2) // ~l5/2 687 PAVGB(%%mm7, %%mm2) // ~l5/2
688 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 688 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
689 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 689 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
690 690
691 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 691 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
692 "psubusb %%mm2, %%mm6 \n\t" 692 "psubusb %%mm2, %%mm6 \n\t"
693 "psubusb %%mm4, %%mm2 \n\t" 693 "psubusb %%mm4, %%mm2 \n\t"
694 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 694 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
695 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 695 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
696 696
697 697
698 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 698 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
699 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 699 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
700 "paddusb "MANGLE(b01)", %%mm4 \n\t" 700 "paddusb "MANGLE(b01)", %%mm4 \n\t"
701 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 701 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
702 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 702 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
703 "pand %%mm4, %%mm3 \n\t" 703 "pand %%mm4, %%mm3 \n\t"
704 704
705 "movq %%mm3, %%mm1 \n\t" 705 "movq %%mm3, %%mm1 \n\t"
706 // "psubusb "MANGLE(b01)", %%mm3 \n\t" 706 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
707 PAVGB(%%mm7, %%mm3) 707 PAVGB(%%mm7, %%mm3)
708 PAVGB(%%mm7, %%mm3) 708 PAVGB(%%mm7, %%mm3)
709 "paddusb %%mm1, %%mm3 \n\t" 709 "paddusb %%mm1, %%mm3 \n\t"
710 // "paddusb "MANGLE(b01)", %%mm3 \n\t" 710 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
711 711
712 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 712 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3
713 "movq (%0, %1, 4), %%mm5 \n\t" //l4 713 "movq (%0, %1, 4), %%mm5 \n\t" //l4
714 "movq (%0, %1, 4), %%mm4 \n\t" //l4 714 "movq (%0, %1, 4), %%mm4 \n\t" //l4
715 "psubusb %%mm6, %%mm5 \n\t" 715 "psubusb %%mm6, %%mm5 \n\t"
716 "psubusb %%mm4, %%mm6 \n\t" 716 "psubusb %%mm4, %%mm6 \n\t"
717 "por %%mm6, %%mm5 \n\t" // |l3-l4| 717 "por %%mm6, %%mm5 \n\t" // |l3-l4|
718 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) 718 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
719 "pxor %%mm6, %%mm0 \n\t" 719 "pxor %%mm6, %%mm0 \n\t"
720 "pand %%mm0, %%mm3 \n\t" 720 "pand %%mm0, %%mm3 \n\t"
721 PMINUB(%%mm5, %%mm3, %%mm0) 721 PMINUB(%%mm5, %%mm3, %%mm0)
722 722
723 "psubusb "MANGLE(b01)", %%mm3 \n\t" 723 "psubusb "MANGLE(b01)", %%mm3 \n\t"
724 PAVGB(%%mm7, %%mm3) 724 PAVGB(%%mm7, %%mm3)
725 725
726 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 726 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
727 "movq (%0, %1, 4), %%mm2 \n\t" 727 "movq (%0, %1, 4), %%mm2 \n\t"
728 "pxor %%mm6, %%mm0 \n\t" 728 "pxor %%mm6, %%mm0 \n\t"
729 "pxor %%mm6, %%mm2 \n\t" 729 "pxor %%mm6, %%mm2 \n\t"
730 "psubb %%mm3, %%mm0 \n\t" 730 "psubb %%mm3, %%mm0 \n\t"
731 "paddb %%mm3, %%mm2 \n\t" 731 "paddb %%mm3, %%mm2 \n\t"
732 "pxor %%mm6, %%mm0 \n\t" 732 "pxor %%mm6, %%mm0 \n\t"
733 "pxor %%mm6, %%mm2 \n\t" 733 "pxor %%mm6, %%mm2 \n\t"
734 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 734 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
735 "movq %%mm2, (%0, %1, 4) \n\t" 735 "movq %%mm2, (%0, %1, 4) \n\t"
736 #endif //0 736 #endif //0
737 737
738 "lea (%0, %1), %%"REG_a" \n\t" 738 "lea (%0, %1), %%"REG_a" \n\t"
739 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 739 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
740 // 0 1 2 3 4 5 6 7 740 // 0 1 2 3 4 5 6 7
741 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 741 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
742 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 742 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
743 743
744 744
745 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 745 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3
746 "movq (%0, %1, 4), %%mm0 \n\t" // l4 746 "movq (%0, %1, 4), %%mm0 \n\t" // l4
747 "pxor %%mm6, %%mm1 \n\t" // -l3-1 747 "pxor %%mm6, %%mm1 \n\t" // -l3-1
748 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 748 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
749 // mm1=-l3-1, mm0=128-q 749 // mm1=-l3-1, mm0=128-q
750 750
751 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 751 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5
752 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 752 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2
753 "pxor %%mm6, %%mm2 \n\t" // -l5-1 753 "pxor %%mm6, %%mm2 \n\t" // -l5-1
754 "movq %%mm2, %%mm5 \n\t" // -l5-1 754 "movq %%mm2, %%mm5 \n\t" // -l5-1
755 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 755 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
756 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 756 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
757 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 757 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
758 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 758 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
759 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 759 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
760 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 760 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
761 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 761 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
762 762
763 "movq (%%"REG_a"), %%mm2 \n\t" // l1 763 "movq (%%"REG_a"), %%mm2 \n\t" // l1
764 "pxor %%mm6, %%mm2 \n\t" // -l1-1 764 "pxor %%mm6, %%mm2 \n\t" // -l1-1
765 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 765 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
766 PAVGB((%0), %%mm1) // (l0-l3+256)/2 766 PAVGB((%0), %%mm1) // (l0-l3+256)/2
767 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 767 "movq "MANGLE(b80)", %%mm3 \n\t" // 128
768 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 768 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
769 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 769 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
770 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 770 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
771 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 771 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
772 772
773 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 773 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2
774 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 774 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7
775 "pxor %%mm6, %%mm1 \n\t" // -l7-1 775 "pxor %%mm6, %%mm1 \n\t" // -l7-1
776 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 776 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
777 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 777 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
778 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 778 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
779 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 779 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
780 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 780 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
781 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 781 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
782 782
783 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 783 "movq "MANGLE(b00)", %%mm1 \n\t" // 0
784 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 784 "movq "MANGLE(b00)", %%mm5 \n\t" // 0
785 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 785 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
786 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 786 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
787 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| 787 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
788 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| 788 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
789 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 789 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
790 790
791 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 791 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
792 792
793 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 793 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
794 "movq %2, %%mm2 \n\t" // QP 794 "movq %2, %%mm2 \n\t" // QP
795 PAVGB(%%mm6, %%mm2) // 128 + QP/2 795 PAVGB(%%mm6, %%mm2) // 128 + QP/2
796 "psubb %%mm6, %%mm2 \n\t" 796 "psubb %%mm6, %%mm2 \n\t"
797 797
798 "movq %%mm4, %%mm1 \n\t" 798 "movq %%mm4, %%mm1 \n\t"
799 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 799 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
800 "pxor %%mm1, %%mm4 \n\t" 800 "pxor %%mm1, %%mm4 \n\t"
801 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 801 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
802 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 802 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
803 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 803 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
804 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 804 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
805 805
806 "movq %%mm4, %%mm3 \n\t" // d 806 "movq %%mm4, %%mm3 \n\t" // d
807 "psubusb "MANGLE(b01)", %%mm4 \n\t" 807 "psubusb "MANGLE(b01)", %%mm4 \n\t"
808 PAVGB(%%mm7, %%mm4) // d/32 808 PAVGB(%%mm7, %%mm4) // d/32
809 PAVGB(%%mm7, %%mm4) // (d + 32)/64 809 PAVGB(%%mm7, %%mm4) // (d + 32)/64
810 "paddb %%mm3, %%mm4 \n\t" // 5d/64 810 "paddb %%mm3, %%mm4 \n\t" // 5d/64
811 "pand %%mm2, %%mm4 \n\t" 811 "pand %%mm2, %%mm4 \n\t"
812 812
813 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 813 "movq "MANGLE(b80)", %%mm5 \n\t" // 128
814 "psubb %%mm0, %%mm5 \n\t" // q 814 "psubb %%mm0, %%mm5 \n\t" // q
815 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 815 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
816 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 816 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
817 "pxor %%mm7, %%mm5 \n\t" 817 "pxor %%mm7, %%mm5 \n\t"
818 818
819 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) 819 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
820 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 820 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
821 821
822 "pand %%mm7, %%mm4 \n\t" 822 "pand %%mm7, %%mm4 \n\t"
823 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 823 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
824 "movq (%0, %1, 4), %%mm2 \n\t" 824 "movq (%0, %1, 4), %%mm2 \n\t"
825 "pxor %%mm1, %%mm0 \n\t" 825 "pxor %%mm1, %%mm0 \n\t"
826 "pxor %%mm1, %%mm2 \n\t" 826 "pxor %%mm1, %%mm2 \n\t"
827 "paddb %%mm4, %%mm0 \n\t" 827 "paddb %%mm4, %%mm0 \n\t"
828 "psubb %%mm4, %%mm2 \n\t" 828 "psubb %%mm4, %%mm2 \n\t"
829 "pxor %%mm1, %%mm0 \n\t" 829 "pxor %%mm1, %%mm0 \n\t"
830 "pxor %%mm1, %%mm2 \n\t" 830 "pxor %%mm1, %%mm2 \n\t"
831 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 831 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
832 "movq %%mm2, (%0, %1, 4) \n\t" 832 "movq %%mm2, (%0, %1, 4) \n\t"
833 833
834 : 834 :
835 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 835 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
836 : "%"REG_a, "%"REG_c 836 : "%"REG_a, "%"REG_c
837 ); 837 );
838 838
839 /* 839 /*
840 { 840 {
841 int x; 841 int x;
842 src-= stride; 842 src-= stride;
843 for(x=0; x<BLOCK_SIZE; x++) 843 for(x=0; x<BLOCK_SIZE; x++)
844 { 844 {
845 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 845 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
846 if(ABS(middleEnergy)< 8*QP) 846 if(ABS(middleEnergy)< 8*QP)
847 { 847 {
848 const int q=(src[l4] - src[l5])/2; 848 const int q=(src[l4] - src[l5])/2;
849 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 849 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
850 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 850 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
851 851
852 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); 852 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
853 d= MAX(d, 0); 853 d= MAX(d, 0);
854 854
855 d= (5*d + 32) >> 6; 855 d= (5*d + 32) >> 6;
856 d*= SIGN(-middleEnergy); 856 d*= SIGN(-middleEnergy);
857 857
858 if(q>0) 858 if(q>0)
859 { 859 {
860 d= d<0 ? 0 : d; 860 d= d<0 ? 0 : d;
861 d= d>q ? q : d; 861 d= d>q ? q : d;
862 } 862 }
863 else 863 else
864 { 864 {
865 d= d>0 ? 0 : d; 865 d= d>0 ? 0 : d;
866 d= d<q ? q : d; 866 d= d<q ? q : d;
867 } 867 }
868 868
869 src[l4]-= d; 869 src[l4]-= d;
870 src[l5]+= d; 870 src[l5]+= d;
871 } 871 }
872 src++; 872 src++;
873 } 873 }
874 src-=8; 874 src-=8;
875 for(x=0; x<8; x++) 875 for(x=0; x<8; x++)
876 { 876 {
877 int y; 877 int y;
878 for(y=4; y<6; y++) 878 for(y=4; y<6; y++)
879 { 879 {
880 int d= src[x+y*stride] - tmp[x+(y-4)*8]; 880 int d= src[x+y*stride] - tmp[x+(y-4)*8];
881 int ad= ABS(d); 881 int ad= ABS(d);
882 static int max=0; 882 static int max=0;
883 static int sum=0; 883 static int sum=0;
884 static int num=0; 884 static int num=0;
885 static int bias=0; 885 static int bias=0;
886 886
887 if(max<ad) max=ad; 887 if(max<ad) max=ad;
888 sum+= ad>3 ? 1 : 0; 888 sum+= ad>3 ? 1 : 0;
889 if(ad>3) 889 if(ad>3)
890 { 890 {
891 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 891 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
892 } 892 }
893 if(y==4) bias+=d; 893 if(y==4) bias+=d;
894 num++; 894 num++;
895 if(num%1000000 == 0) 895 if(num%1000000 == 0)
896 { 896 {
897 printf(" %d %d %d %d\n", num, sum, max, bias); 897 printf(" %d %d %d %d\n", num, sum, max, bias);
898 } 898 }
899 } 899 }
900 } 900 }
901 } 901 }
902 */ 902 */
903 #elif defined (HAVE_MMX) 903 #elif defined (HAVE_MMX)
904 src+= stride*4; 904 src+= stride*4;
905 asm volatile( 905 asm volatile(
906 "pxor %%mm7, %%mm7 \n\t" 906 "pxor %%mm7, %%mm7 \n\t"
907 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 907 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
908 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 908 "and "ALIGN_MASK", %%"REG_c" \n\t" // align
909 // 0 1 2 3 4 5 6 7 909 // 0 1 2 3 4 5 6 7
910 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 910 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
911 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 911 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
912 912
913 "movq (%0), %%mm0 \n\t" 913 "movq (%0), %%mm0 \n\t"
914 "movq %%mm0, %%mm1 \n\t" 914 "movq %%mm0, %%mm1 \n\t"
915 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 915 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
916 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 916 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
917 917
918 "movq (%0, %1), %%mm2 \n\t" 918 "movq (%0, %1), %%mm2 \n\t"
919 "lea (%0, %1, 2), %%"REG_a" \n\t" 919 "lea (%0, %1, 2), %%"REG_a" \n\t"
920 "movq %%mm2, %%mm3 \n\t" 920 "movq %%mm2, %%mm3 \n\t"
921 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 921 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
922 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 922 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
923 923
924 "movq (%%"REG_a"), %%mm4 \n\t" 924 "movq (%%"REG_a"), %%mm4 \n\t"
925 "movq %%mm4, %%mm5 \n\t" 925 "movq %%mm4, %%mm5 \n\t"
926 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 926 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
927 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 927 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
928 928
929 "paddw %%mm0, %%mm0 \n\t" // 2L0 929 "paddw %%mm0, %%mm0 \n\t" // 2L0
930 "paddw %%mm1, %%mm1 \n\t" // 2H0 930 "paddw %%mm1, %%mm1 \n\t" // 2H0
931 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 931 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
932 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 932 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
933 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 933 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
934 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 934 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
935 935
936 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 936 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
937 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 937 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
940 940
941 "movq (%%"REG_a", %1), %%mm2 \n\t" 941 "movq (%%"REG_a", %1), %%mm2 \n\t"
942 "movq %%mm2, %%mm3 \n\t" 942 "movq %%mm2, %%mm3 \n\t"
943 "punpcklbw %%mm7, %%mm2 \n\t" // L3 943 "punpcklbw %%mm7, %%mm2 \n\t" // L3
944 "punpckhbw %%mm7, %%mm3 \n\t" // H3 944 "punpckhbw %%mm7, %%mm3 \n\t" // H3
945 945
946 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 946 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
947 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 947 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
948 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 948 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
949 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 949 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
950 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 950 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
951 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 951 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
952 952
953 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 953 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
954 "movq %%mm0, %%mm1 \n\t" 954 "movq %%mm0, %%mm1 \n\t"
955 "punpcklbw %%mm7, %%mm0 \n\t" // L4 955 "punpcklbw %%mm7, %%mm0 \n\t" // L4
956 "punpckhbw %%mm7, %%mm1 \n\t" // H4 956 "punpckhbw %%mm7, %%mm1 \n\t" // H4
957 957
958 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 958 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
959 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 959 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
960 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 960 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
961 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 961 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
962 "paddw %%mm4, %%mm4 \n\t" // 2L2 962 "paddw %%mm4, %%mm4 \n\t" // 2L2
963 "paddw %%mm5, %%mm5 \n\t" // 2H2 963 "paddw %%mm5, %%mm5 \n\t" // 2H2
964 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 964 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
965 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 965 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
966 966
967 "lea (%%"REG_a", %1), %0 \n\t" 967 "lea (%%"REG_a", %1), %0 \n\t"
968 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 968 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
969 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 969 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
972 //50 opcodes so far 972 //50 opcodes so far
973 "movq (%0, %1, 2), %%mm2 \n\t" 973 "movq (%0, %1, 2), %%mm2 \n\t"
974 "movq %%mm2, %%mm3 \n\t" 974 "movq %%mm2, %%mm3 \n\t"
975 "punpcklbw %%mm7, %%mm2 \n\t" // L5 975 "punpcklbw %%mm7, %%mm2 \n\t" // L5
976 "punpckhbw %%mm7, %%mm3 \n\t" // H5 976 "punpckhbw %%mm7, %%mm3 \n\t" // H5
977 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 977 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
978 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 978 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
979 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 979 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
980 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 980 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
981 981
982 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 982 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
983 "punpcklbw %%mm7, %%mm6 \n\t" // L6 983 "punpcklbw %%mm7, %%mm6 \n\t" // L6
984 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 984 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
985 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 985 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
986 "punpckhbw %%mm7, %%mm6 \n\t" // H6 986 "punpckhbw %%mm7, %%mm6 \n\t" // H6
987 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 987 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
988 988
989 "paddw %%mm0, %%mm0 \n\t" // 2L4 989 "paddw %%mm0, %%mm0 \n\t" // 2L4
990 "paddw %%mm1, %%mm1 \n\t" // 2H4 990 "paddw %%mm1, %%mm1 \n\t" // 2H4
991 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 991 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
992 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 992 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
993 993
994 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 994 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
995 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 995 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
996 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 996 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
997 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 997 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
998 998
999 "movq (%0, %1, 4), %%mm2 \n\t" 999 "movq (%0, %1, 4), %%mm2 \n\t"
1000 "movq %%mm2, %%mm3 \n\t" 1000 "movq %%mm2, %%mm3 \n\t"
1001 "punpcklbw %%mm7, %%mm2 \n\t" // L7 1001 "punpcklbw %%mm7, %%mm2 \n\t" // L7
1002 "punpckhbw %%mm7, %%mm3 \n\t" // H7 1002 "punpckhbw %%mm7, %%mm3 \n\t" // H7
1003 1003
1004 "paddw %%mm2, %%mm2 \n\t" // 2L7 1004 "paddw %%mm2, %%mm2 \n\t" // 2L7
1005 "paddw %%mm3, %%mm3 \n\t" // 2H7 1005 "paddw %%mm3, %%mm3 \n\t" // 2H7
1006 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 1006 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1007 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 1007 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1008 1008
1009 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 1009 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1010 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 1010 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1011 1011
1012 #ifdef HAVE_MMX2 1012 #ifdef HAVE_MMX2
1013 "movq %%mm7, %%mm6 \n\t" // 0 1013 "movq %%mm7, %%mm6 \n\t" // 0
1014 "psubw %%mm0, %%mm6 \n\t" 1014 "psubw %%mm0, %%mm6 \n\t"
1015 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 1015 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1016 "movq %%mm7, %%mm6 \n\t" // 0 1016 "movq %%mm7, %%mm6 \n\t" // 0
1017 "psubw %%mm1, %%mm6 \n\t" 1017 "psubw %%mm1, %%mm6 \n\t"
1018 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 1018 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1019 "movq %%mm7, %%mm6 \n\t" // 0 1019 "movq %%mm7, %%mm6 \n\t" // 0
1020 "psubw %%mm2, %%mm6 \n\t" 1020 "psubw %%mm2, %%mm6 \n\t"
1021 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 1021 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1022 "movq %%mm7, %%mm6 \n\t" // 0 1022 "movq %%mm7, %%mm6 \n\t" // 0
1023 "psubw %%mm3, %%mm6 \n\t" 1023 "psubw %%mm3, %%mm6 \n\t"
1024 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 1024 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1025 #else 1025 #else
1026 "movq %%mm7, %%mm6 \n\t" // 0 1026 "movq %%mm7, %%mm6 \n\t" // 0
1027 "pcmpgtw %%mm0, %%mm6 \n\t" 1027 "pcmpgtw %%mm0, %%mm6 \n\t"
1028 "pxor %%mm6, %%mm0 \n\t" 1028 "pxor %%mm6, %%mm0 \n\t"
1029 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 1029 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1030 "movq %%mm7, %%mm6 \n\t" // 0 1030 "movq %%mm7, %%mm6 \n\t" // 0
1031 "pcmpgtw %%mm1, %%mm6 \n\t" 1031 "pcmpgtw %%mm1, %%mm6 \n\t"
1032 "pxor %%mm6, %%mm1 \n\t" 1032 "pxor %%mm6, %%mm1 \n\t"
1033 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 1033 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1034 "movq %%mm7, %%mm6 \n\t" // 0 1034 "movq %%mm7, %%mm6 \n\t" // 0
1035 "pcmpgtw %%mm2, %%mm6 \n\t" 1035 "pcmpgtw %%mm2, %%mm6 \n\t"
1036 "pxor %%mm6, %%mm2 \n\t" 1036 "pxor %%mm6, %%mm2 \n\t"
1037 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 1037 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1038 "movq %%mm7, %%mm6 \n\t" // 0 1038 "movq %%mm7, %%mm6 \n\t" // 0
1039 "pcmpgtw %%mm3, %%mm6 \n\t" 1039 "pcmpgtw %%mm3, %%mm6 \n\t"
1040 "pxor %%mm6, %%mm3 \n\t" 1040 "pxor %%mm6, %%mm3 \n\t"
1041 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 1041 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1042 #endif 1042 #endif
1043 1043
1044 #ifdef HAVE_MMX2 1044 #ifdef HAVE_MMX2
1045 "pminsw %%mm2, %%mm0 \n\t" 1045 "pminsw %%mm2, %%mm0 \n\t"
1046 "pminsw %%mm3, %%mm1 \n\t" 1046 "pminsw %%mm3, %%mm1 \n\t"
1047 #else 1047 #else
1048 "movq %%mm0, %%mm6 \n\t" 1048 "movq %%mm0, %%mm6 \n\t"
1049 "psubusw %%mm2, %%mm6 \n\t" 1049 "psubusw %%mm2, %%mm6 \n\t"
1050 "psubw %%mm6, %%mm0 \n\t" 1050 "psubw %%mm6, %%mm0 \n\t"
1051 "movq %%mm1, %%mm6 \n\t" 1051 "movq %%mm1, %%mm6 \n\t"
1052 "psubusw %%mm3, %%mm6 \n\t" 1052 "psubusw %%mm3, %%mm6 \n\t"
1053 "psubw %%mm6, %%mm1 \n\t" 1053 "psubw %%mm6, %%mm1 \n\t"
1054 #endif 1054 #endif
1055 1055
1056 "movd %2, %%mm2 \n\t" // QP 1056 "movd %2, %%mm2 \n\t" // QP
1057 "punpcklbw %%mm7, %%mm2 \n\t" 1057 "punpcklbw %%mm7, %%mm2 \n\t"
1058 1058
1059 "movq %%mm7, %%mm6 \n\t" // 0 1059 "movq %%mm7, %%mm6 \n\t" // 0
1060 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 1060 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1061 "pxor %%mm6, %%mm4 \n\t" 1061 "pxor %%mm6, %%mm4 \n\t"
1062 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 1062 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1063 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 1063 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1064 "pxor %%mm7, %%mm5 \n\t" 1064 "pxor %%mm7, %%mm5 \n\t"
1065 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 1065 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1066 // 100 opcodes 1066 // 100 opcodes
1067 "psllw $3, %%mm2 \n\t" // 8QP 1067 "psllw $3, %%mm2 \n\t" // 8QP
1068 "movq %%mm2, %%mm3 \n\t" // 8QP 1068 "movq %%mm2, %%mm3 \n\t" // 8QP
1069 "pcmpgtw %%mm4, %%mm2 \n\t" 1069 "pcmpgtw %%mm4, %%mm2 \n\t"
1070 "pcmpgtw %%mm5, %%mm3 \n\t" 1070 "pcmpgtw %%mm5, %%mm3 \n\t"
1071 "pand %%mm2, %%mm4 \n\t" 1071 "pand %%mm2, %%mm4 \n\t"
1072 "pand %%mm3, %%mm5 \n\t" 1072 "pand %%mm3, %%mm5 \n\t"
1073 1073
1074 1074
1075 "psubusw %%mm0, %%mm4 \n\t" // hd 1075 "psubusw %%mm0, %%mm4 \n\t" // hd
1076 "psubusw %%mm1, %%mm5 \n\t" // ld 1076 "psubusw %%mm1, %%mm5 \n\t" // ld
1077 1077
1078 1078
1079 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 1079 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
1080 "pmullw %%mm2, %%mm4 \n\t" 1080 "pmullw %%mm2, %%mm4 \n\t"
1081 "pmullw %%mm2, %%mm5 \n\t" 1081 "pmullw %%mm2, %%mm5 \n\t"
1082 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 1082 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
1083 "paddw %%mm2, %%mm4 \n\t" 1083 "paddw %%mm2, %%mm4 \n\t"
1084 "paddw %%mm2, %%mm5 \n\t" 1084 "paddw %%mm2, %%mm5 \n\t"
1085 "psrlw $6, %%mm4 \n\t" 1085 "psrlw $6, %%mm4 \n\t"
1086 "psrlw $6, %%mm5 \n\t" 1086 "psrlw $6, %%mm5 \n\t"
1087 1087
1088 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 1088 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
1089 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 1089 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
1090 1090
1091 "pxor %%mm2, %%mm2 \n\t" 1091 "pxor %%mm2, %%mm2 \n\t"
1092 "pxor %%mm3, %%mm3 \n\t" 1092 "pxor %%mm3, %%mm3 \n\t"
1093 1093
1094 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 1094 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1095 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 1095 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1096 "pxor %%mm2, %%mm0 \n\t" 1096 "pxor %%mm2, %%mm0 \n\t"
1097 "pxor %%mm3, %%mm1 \n\t" 1097 "pxor %%mm3, %%mm1 \n\t"
1098 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 1098 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1099 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 1099 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1100 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 1100 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1101 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 1101 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1102 1102
1103 "pxor %%mm6, %%mm2 \n\t" 1103 "pxor %%mm6, %%mm2 \n\t"
1104 "pxor %%mm7, %%mm3 \n\t" 1104 "pxor %%mm7, %%mm3 \n\t"
1105 "pand %%mm2, %%mm4 \n\t" 1105 "pand %%mm2, %%mm4 \n\t"
1106 "pand %%mm3, %%mm5 \n\t" 1106 "pand %%mm3, %%mm5 \n\t"
1107 1107
1108 #ifdef HAVE_MMX2 1108 #ifdef HAVE_MMX2
1109 "pminsw %%mm0, %%mm4 \n\t" 1109 "pminsw %%mm0, %%mm4 \n\t"
1110 "pminsw %%mm1, %%mm5 \n\t" 1110 "pminsw %%mm1, %%mm5 \n\t"
1111 #else 1111 #else
1112 "movq %%mm4, %%mm2 \n\t" 1112 "movq %%mm4, %%mm2 \n\t"
1113 "psubusw %%mm0, %%mm2 \n\t" 1113 "psubusw %%mm0, %%mm2 \n\t"
1114 "psubw %%mm2, %%mm4 \n\t" 1114 "psubw %%mm2, %%mm4 \n\t"
1115 "movq %%mm5, %%mm2 \n\t" 1115 "movq %%mm5, %%mm2 \n\t"
1116 "psubusw %%mm1, %%mm2 \n\t" 1116 "psubusw %%mm1, %%mm2 \n\t"
1117 "psubw %%mm2, %%mm5 \n\t" 1117 "psubw %%mm2, %%mm5 \n\t"
1118 #endif 1118 #endif
1119 "pxor %%mm6, %%mm4 \n\t" 1119 "pxor %%mm6, %%mm4 \n\t"
1120 "pxor %%mm7, %%mm5 \n\t" 1120 "pxor %%mm7, %%mm5 \n\t"
1121 "psubw %%mm6, %%mm4 \n\t" 1121 "psubw %%mm6, %%mm4 \n\t"
1122 "psubw %%mm7, %%mm5 \n\t" 1122 "psubw %%mm7, %%mm5 \n\t"
1123 "packsswb %%mm5, %%mm4 \n\t" 1123 "packsswb %%mm5, %%mm4 \n\t"
1124 "movq (%0), %%mm0 \n\t" 1124 "movq (%0), %%mm0 \n\t"
1125 "paddb %%mm4, %%mm0 \n\t" 1125 "paddb %%mm4, %%mm0 \n\t"
1126 "movq %%mm0, (%0) \n\t" 1126 "movq %%mm0, (%0) \n\t"
1127 "movq (%0, %1), %%mm0 \n\t" 1127 "movq (%0, %1), %%mm0 \n\t"
1128 "psubb %%mm4, %%mm0 \n\t" 1128 "psubb %%mm4, %%mm0 \n\t"
1129 "movq %%mm0, (%0, %1) \n\t" 1129 "movq %%mm0, (%0, %1) \n\t"
1130 1130
1131 : "+r" (src) 1131 : "+r" (src)
1132 : "r" ((long)stride), "m" (c->pQPb) 1132 : "r" ((long)stride), "m" (c->pQPb)
1133 : "%"REG_a, "%"REG_c 1133 : "%"REG_a, "%"REG_c
1134 ); 1134 );
1135 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1135 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1136 const int l1= stride; 1136 const int l1= stride;
1137 const int l2= stride + l1; 1137 const int l2= stride + l1;
1138 const int l3= stride + l2; 1138 const int l3= stride + l2;
1139 const int l4= stride + l3; 1139 const int l4= stride + l3;
1140 const int l5= stride + l4; 1140 const int l5= stride + l4;
1141 const int l6= stride + l5; 1141 const int l6= stride + l5;
1142 const int l7= stride + l6; 1142 const int l7= stride + l6;
1143 const int l8= stride + l7; 1143 const int l8= stride + l7;
1144 // const int l9= stride + l8; 1144 // const int l9= stride + l8;
1145 int x; 1145 int x;
1146 src+= stride*3; 1146 src+= stride*3;
1147 for(x=0; x<BLOCK_SIZE; x++) 1147 for(x=0; x<BLOCK_SIZE; x++)
1148 { 1148 {
1149 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1149 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1150 if(ABS(middleEnergy) < 8*c->QP) 1150 if(ABS(middleEnergy) < 8*c->QP)
1151 { 1151 {
1152 const int q=(src[l4] - src[l5])/2; 1152 const int q=(src[l4] - src[l5])/2;
1153 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 1153 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1154 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 1154 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1155 1155
1156 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); 1156 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1157 d= MAX(d, 0); 1157 d= MAX(d, 0);
1158 1158
1159 d= (5*d + 32) >> 6; 1159 d= (5*d + 32) >> 6;
1160 d*= SIGN(-middleEnergy); 1160 d*= SIGN(-middleEnergy);
1161 1161
1162 if(q>0) 1162 if(q>0)
1163 { 1163 {
1164 d= d<0 ? 0 : d; 1164 d= d<0 ? 0 : d;
1165 d= d>q ? q : d; 1165 d= d>q ? q : d;
1166 } 1166 }
1167 else 1167 else
1168 { 1168 {
1169 d= d>0 ? 0 : d; 1169 d= d>0 ? 0 : d;
1170 d= d<q ? q : d; 1170 d= d<q ? q : d;
1171 } 1171 }
1172 1172
1173 src[l4]-= d; 1173 src[l4]-= d;
1174 src[l5]+= d; 1174 src[l5]+= d;
1175 } 1175 }
1176 src++; 1176 src++;
1177 } 1177 }
1178 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1178 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1179 } 1179 }
1180 #endif //HAVE_ALTIVEC 1180 #endif //HAVE_ALTIVEC
1181 1181
1182 #ifndef HAVE_ALTIVEC 1182 #ifndef HAVE_ALTIVEC
/**
 * Dering filter applied to one block.
 *
 * src is the start of the block area, stride the byte distance between two
 * successive lines, and c supplies the quantiser (c->QP in the C path,
 * c->pQPb / c->pQPb2 in the MMX2/3DNow path).
 *
 * C path, as written below: the range (max - min) of the pixels in rows 1..8,
 * columns 1..8 is measured and the block is left untouched when that range is
 * below deringThreshold. Otherwise a pixel is replaced by a 3x3 weighted
 * average, limited to +-(c->QP/2 + 1) around its old value, but only if the
 * pixel and its 8 neighbours all lie on the same side of avg = (min+max+1)>>1.
 *
 * NOTE(review): the asm path looks like the vectorised counterpart of the C
 * path, but it takes min/max from the 8 bytes starting at column 0 of each
 * row - confirm the two paths are meant to agree exactly.
 */
1183 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) 1183 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1184 { 1184 {
1185 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1185 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// ---- MMX2 / 3DNow path ----
// Operands: %0 = src, %1 = (long)stride, %2 = c->pQPb, %3 = c->pQPb2.
// NOTE(review): %3 is stored to below ("movq %%mm0, %3") although it is listed
// among the input operands, and the clobber list has no "memory" entry even
// though rows of src and a stack-relative scratch area are written - confirm
// this is safe with the compilers in use.
1186 asm volatile( 1186 asm volatile(
// mm6 = 0 (running maximum), mm7 = all ones (running minimum, and -1 per word).
// The next lines compute (low four bytes of %2 >> 1) + 1, pack the result back
// to bytes (duplicated into both halves) and store it in %3.
1187 "pxor %%mm6, %%mm6 \n\t" 1187 "pxor %%mm6, %%mm6 \n\t"
1188 "pcmpeqb %%mm7, %%mm7 \n\t" 1188 "pcmpeqb %%mm7, %%mm7 \n\t"
1189 "movq %2, %%mm0 \n\t" 1189 "movq %2, %%mm0 \n\t"
1190 "punpcklbw %%mm6, %%mm0 \n\t" 1190 "punpcklbw %%mm6, %%mm0 \n\t"
1191 "psrlw $1, %%mm0 \n\t" 1191 "psrlw $1, %%mm0 \n\t"
1192 "psubw %%mm7, %%mm0 \n\t" 1192 "psubw %%mm7, %%mm0 \n\t"
1193 "packuswb %%mm0, %%mm0 \n\t" 1193 "packuswb %%mm0, %%mm0 \n\t"
1194 "movq %%mm0, %3 \n\t" 1194 "movq %%mm0, %3 \n\t"
1195 1195
// REG_a = src + stride (row 1), REG_d = src + 5*stride (row 5); the table below
// lists the addressing expression used for each row.
1196 "lea (%0, %1), %%"REG_a" \n\t" 1196 "lea (%0, %1), %%"REG_a" \n\t"
1197 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1197 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1198 1198
1199 // 0 1 2 3 4 5 6 7 8 9 1199 // 0 1 2 3 4 5 6 7 8 9
1200 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1200 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1201 1201
// FIND_MIN_MAX(addr): load 8 bytes from addr and fold them bytewise into the
// running minimum (mm7) and running maximum (mm6). The non-MMX2 variant builds
// min/max from saturating subtract plus add/subtract and uses mm1 as scratch.
1202 #undef FIND_MIN_MAX 1202 #undef FIND_MIN_MAX
1203 #ifdef HAVE_MMX2 1203 #ifdef HAVE_MMX2
1204 #define REAL_FIND_MIN_MAX(addr)\ 1204 #define REAL_FIND_MIN_MAX(addr)\
1205 "movq " #addr ", %%mm0 \n\t"\ 1205 "movq " #addr ", %%mm0 \n\t"\
1206 "pminub %%mm0, %%mm7 \n\t"\ 1206 "pminub %%mm0, %%mm7 \n\t"\
1207 "pmaxub %%mm0, %%mm6 \n\t" 1207 "pmaxub %%mm0, %%mm6 \n\t"
1208 #else 1208 #else
1209 #define REAL_FIND_MIN_MAX(addr)\ 1209 #define REAL_FIND_MIN_MAX(addr)\
1210 "movq " #addr ", %%mm0 \n\t"\ 1210 "movq " #addr ", %%mm0 \n\t"\
1211 "movq %%mm7, %%mm1 \n\t"\ 1211 "movq %%mm7, %%mm1 \n\t"\
1212 "psubusb %%mm0, %%mm6 \n\t"\ 1212 "psubusb %%mm0, %%mm6 \n\t"\
1213 "paddb %%mm0, %%mm6 \n\t"\ 1213 "paddb %%mm0, %%mm6 \n\t"\
1214 "psubusb %%mm0, %%mm1 \n\t"\ 1214 "psubusb %%mm0, %%mm1 \n\t"\
1215 "psubb %%mm1, %%mm7 \n\t" 1215 "psubb %%mm1, %%mm7 \n\t"
1216 #endif 1216 #endif
1217 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) 1217 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1218 1218
// Per-column min/max over the rows addressed below.
1219 FIND_MIN_MAX((%%REGa)) 1219 FIND_MIN_MAX((%%REGa))
1220 FIND_MIN_MAX((%%REGa, %1)) 1220 FIND_MIN_MAX((%%REGa, %1))
// NOTE(review): the line numbering of this listing jumps from 1220 to 1223
// here, so two lines of the file are not shown (presumably the FIND_MIN_MAX
// calls for rows 3 and 4 - verify against the real file).
1223 FIND_MIN_MAX((%%REGd)) 1223 FIND_MIN_MAX((%%REGd))
1224 FIND_MIN_MAX((%%REGd, %1)) 1224 FIND_MIN_MAX((%%REGd, %1))
1225 FIND_MIN_MAX((%%REGd, %1, 2)) 1225 FIND_MIN_MAX((%%REGd, %1, 2))
1226 FIND_MIN_MAX((%0, %1, 8)) 1226 FIND_MIN_MAX((%0, %1, 8))
1227 1227
// Horizontal reduction of the minimum: combine mm7 with copies of itself
// shifted by 1, 2 and 4 bytes so that its low byte becomes the minimum of all
// eight columns.
1228 "movq %%mm7, %%mm4 \n\t" 1228 "movq %%mm7, %%mm4 \n\t"
1229 "psrlq $8, %%mm7 \n\t" 1229 "psrlq $8, %%mm7 \n\t"
1230 #ifdef HAVE_MMX2 1230 #ifdef HAVE_MMX2
1231 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1231 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1232 "pshufw $0xF9, %%mm7, %%mm4 \n\t" 1232 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1233 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1233 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1234 "pshufw $0xFE, %%mm7, %%mm4 \n\t" 1234 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1235 "pminub %%mm4, %%mm7 \n\t" 1235 "pminub %%mm4, %%mm7 \n\t"
1236 #else 1236 #else
1237 "movq %%mm7, %%mm1 \n\t" 1237 "movq %%mm7, %%mm1 \n\t"
1238 "psubusb %%mm4, %%mm1 \n\t" 1238 "psubusb %%mm4, %%mm1 \n\t"
1239 "psubb %%mm1, %%mm7 \n\t" 1239 "psubb %%mm1, %%mm7 \n\t"
1240 "movq %%mm7, %%mm4 \n\t" 1240 "movq %%mm7, %%mm4 \n\t"
1241 "psrlq $16, %%mm7 \n\t" 1241 "psrlq $16, %%mm7 \n\t"
1242 "movq %%mm7, %%mm1 \n\t" 1242 "movq %%mm7, %%mm1 \n\t"
1243 "psubusb %%mm4, %%mm1 \n\t" 1243 "psubusb %%mm4, %%mm1 \n\t"
1244 "psubb %%mm1, %%mm7 \n\t" 1244 "psubb %%mm1, %%mm7 \n\t"
1245 "movq %%mm7, %%mm4 \n\t" 1245 "movq %%mm7, %%mm4 \n\t"
1246 "psrlq $32, %%mm7 \n\t" 1246 "psrlq $32, %%mm7 \n\t"
1247 "movq %%mm7, %%mm1 \n\t" 1247 "movq %%mm7, %%mm1 \n\t"
1248 "psubusb %%mm4, %%mm1 \n\t" 1248 "psubusb %%mm4, %%mm1 \n\t"
1249 "psubb %%mm1, %%mm7 \n\t" 1249 "psubb %%mm1, %%mm7 \n\t"
1250 #endif 1250 #endif
1251 1251
1252 1252
// Same horizontal reduction for the maximum in mm6.
1253 "movq %%mm6, %%mm4 \n\t" 1253 "movq %%mm6, %%mm4 \n\t"
1254 "psrlq $8, %%mm6 \n\t" 1254 "psrlq $8, %%mm6 \n\t"
1255 #ifdef HAVE_MMX2 1255 #ifdef HAVE_MMX2
1256 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels 1256 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1257 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 1257 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1258 "pmaxub %%mm4, %%mm6 \n\t" 1258 "pmaxub %%mm4, %%mm6 \n\t"
1259 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 1259 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1260 "pmaxub %%mm4, %%mm6 \n\t" 1260 "pmaxub %%mm4, %%mm6 \n\t"
1261 #else 1261 #else
1262 "psubusb %%mm4, %%mm6 \n\t" 1262 "psubusb %%mm4, %%mm6 \n\t"
1263 "paddb %%mm4, %%mm6 \n\t" 1263 "paddb %%mm4, %%mm6 \n\t"
1264 "movq %%mm6, %%mm4 \n\t" 1264 "movq %%mm6, %%mm4 \n\t"
1265 "psrlq $16, %%mm6 \n\t" 1265 "psrlq $16, %%mm6 \n\t"
1266 "psubusb %%mm4, %%mm6 \n\t" 1266 "psubusb %%mm4, %%mm6 \n\t"
1267 "paddb %%mm4, %%mm6 \n\t" 1267 "paddb %%mm4, %%mm6 \n\t"
1268 "movq %%mm6, %%mm4 \n\t" 1268 "movq %%mm6, %%mm4 \n\t"
1269 "psrlq $32, %%mm6 \n\t" 1269 "psrlq $32, %%mm6 \n\t"
1270 "psubusb %%mm4, %%mm6 \n\t" 1270 "psubusb %%mm4, %%mm6 \n\t"
1271 "paddb %%mm4, %%mm6 \n\t" 1271 "paddb %%mm4, %%mm6 \n\t"
1272 #endif 1272 #endif
// Early exit: compare the low byte of (max - min) with deringThreshold and jump
// to label 1 (end of the asm) when it is below (unsigned compare).
1273 "movq %%mm6, %%mm0 \n\t" // max 1273 "movq %%mm6, %%mm0 \n\t" // max
1274 "psubb %%mm7, %%mm6 \n\t" // max - min 1274 "psubb %%mm7, %%mm6 \n\t" // max - min
1275 "movd %%mm6, %%ecx \n\t" 1275 "movd %%mm6, %%ecx \n\t"
1276 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" 1276 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
1277 " jb 1f \n\t" 1277 " jb 1f \n\t"
// REG_c = scratch address derived from the stack pointer (SP - 24, then masked
// with ALIGN_MASK). (REG_c) receives the average a replicated into all 8 bytes;
// 8(REG_c) is used by REAL_DERING_CORE as a spill slot.
1278 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" 1278 "lea -24(%%"REG_SP"), %%"REG_c" \n\t"
1279 "and "ALIGN_MASK", %%"REG_c" \n\t" 1279 "and "ALIGN_MASK", %%"REG_c" \n\t"
1280 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 1280 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1281 "punpcklbw %%mm7, %%mm7 \n\t" 1281 "punpcklbw %%mm7, %%mm7 \n\t"
1282 "punpcklbw %%mm7, %%mm7 \n\t" 1282 "punpcklbw %%mm7, %%mm7 \n\t"
1283 "punpcklbw %%mm7, %%mm7 \n\t" 1283 "punpcklbw %%mm7, %%mm7 \n\t"
1284 "movq %%mm7, (%%"REG_c") \n\t" 1284 "movq %%mm7, (%%"REG_c") \n\t"
1285 1285
// Row 0 (at %0): mm1 ends up as the horizontally smoothed row, mm0 as the sum
// of the three per-pixel comparison masks (left, centre, right) against a.
// The left/right neighbours are built by shifting the row one byte and
// patching in the byte from -4(%0) / 8(%0).
1286 "movq (%0), %%mm0 \n\t" // L10 1286 "movq (%0), %%mm0 \n\t" // L10
1287 "movq %%mm0, %%mm1 \n\t" // L10 1287 "movq %%mm0, %%mm1 \n\t" // L10
1288 "movq %%mm0, %%mm2 \n\t" // L10 1288 "movq %%mm0, %%mm2 \n\t" // L10
1289 "psllq $8, %%mm1 \n\t" 1289 "psllq $8, %%mm1 \n\t"
1290 "psrlq $8, %%mm2 \n\t" 1290 "psrlq $8, %%mm2 \n\t"
1291 "movd -4(%0), %%mm3 \n\t" 1291 "movd -4(%0), %%mm3 \n\t"
1292 "movd 8(%0), %%mm4 \n\t" 1292 "movd 8(%0), %%mm4 \n\t"
1293 "psrlq $24, %%mm3 \n\t" 1293 "psrlq $24, %%mm3 \n\t"
1294 "psllq $56, %%mm4 \n\t" 1294 "psllq $56, %%mm4 \n\t"
1295 "por %%mm3, %%mm1 \n\t" // L00 1295 "por %%mm3, %%mm1 \n\t" // L00
1296 "por %%mm4, %%mm2 \n\t" // L20 1296 "por %%mm4, %%mm2 \n\t" // L20
1297 "movq %%mm1, %%mm3 \n\t" // L00 1297 "movq %%mm1, %%mm3 \n\t" // L00
1298 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 1298 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1299 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 1299 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1300 "psubusb %%mm7, %%mm0 \n\t" 1300 "psubusb %%mm7, %%mm0 \n\t"
1301 "psubusb %%mm7, %%mm2 \n\t" 1301 "psubusb %%mm7, %%mm2 \n\t"
1302 "psubusb %%mm7, %%mm3 \n\t" 1302 "psubusb %%mm7, %%mm3 \n\t"
1303 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 1303 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1304 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 1304 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1305 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 1305 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1306 "paddb %%mm2, %%mm0 \n\t" 1306 "paddb %%mm2, %%mm0 \n\t"
1307 "paddb %%mm3, %%mm0 \n\t" 1307 "paddb %%mm3, %%mm0 \n\t"
1308 1308
// Row 1 (at REG_a): same preparation; smoothed row in mm3, mask sum in mm2.
1309 "movq (%%"REG_a"), %%mm2 \n\t" // L11 1309 "movq (%%"REG_a"), %%mm2 \n\t" // L11
1310 "movq %%mm2, %%mm3 \n\t" // L11 1310 "movq %%mm2, %%mm3 \n\t" // L11
1311 "movq %%mm2, %%mm4 \n\t" // L11 1311 "movq %%mm2, %%mm4 \n\t" // L11
1312 "psllq $8, %%mm3 \n\t" 1312 "psllq $8, %%mm3 \n\t"
1313 "psrlq $8, %%mm4 \n\t" 1313 "psrlq $8, %%mm4 \n\t"
1314 "movd -4(%%"REG_a"), %%mm5 \n\t" 1314 "movd -4(%%"REG_a"), %%mm5 \n\t"
1315 "movd 8(%%"REG_a"), %%mm6 \n\t" 1315 "movd 8(%%"REG_a"), %%mm6 \n\t"
1316 "psrlq $24, %%mm5 \n\t" 1316 "psrlq $24, %%mm5 \n\t"
1317 "psllq $56, %%mm6 \n\t" 1317 "psllq $56, %%mm6 \n\t"
1318 "por %%mm5, %%mm3 \n\t" // L01 1318 "por %%mm5, %%mm3 \n\t" // L01
1319 "por %%mm6, %%mm4 \n\t" // L21 1319 "por %%mm6, %%mm4 \n\t" // L21
1320 "movq %%mm3, %%mm5 \n\t" // L01 1320 "movq %%mm3, %%mm5 \n\t" // L01
1321 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 1321 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1322 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 1322 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1323 "psubusb %%mm7, %%mm2 \n\t" 1323 "psubusb %%mm7, %%mm2 \n\t"
1324 "psubusb %%mm7, %%mm4 \n\t" 1324 "psubusb %%mm7, %%mm4 \n\t"
1325 "psubusb %%mm7, %%mm5 \n\t" 1325 "psubusb %%mm7, %%mm5 \n\t"
1326 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 1326 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1327 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 1327 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1328 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 1328 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1329 "paddb %%mm4, %%mm2 \n\t" 1329 "paddb %%mm4, %%mm2 \n\t"
1330 "paddb %%mm5, %%mm2 \n\t" 1330 "paddb %%mm5, %%mm2 \n\t"
// REAL_DERING_CORE(dst, src, ...) handles one output row. It loads the row at
// src (the row after dst in the invocations below), smooths it horizontally
// (lx) and sums its three comparison masks against a (sx), exactly as done for
// rows 0 and 1 above. pplx/plx/lx are the smoothed rows and ppsx/psx/sx the mask
// sums of three successive rows. pplx is averaged with lx and then plx to give
// the filtered row, which PMAXUB/PMINUB clamp to [dst - %3, dst + %3]
// (saturating). The filtered bytes replace dst only where
// ((ppsx + psx + sx) & b08) equals the zero register loaded from b00; all other
// bytes of dst are kept.
// NOTE(review): presumably b08 holds 0x08 in every byte, which would make the
// test true exactly when all nine masks agree - confirm against its definition.
1331 // 0, 2, 3, 1 1331 // 0, 2, 3, 1
1332 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1332 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1333 "movq " #src ", " #sx " \n\t" /* src[0] */\ 1333 "movq " #src ", " #sx " \n\t" /* src[0] */\
1334 "movq " #sx ", " #lx " \n\t" /* src[0] */\ 1334 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1335 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ 1335 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1336 "psllq $8, " #lx " \n\t"\ 1336 "psllq $8, " #lx " \n\t"\
1337 "psrlq $8, " #t0 " \n\t"\ 1337 "psrlq $8, " #t0 " \n\t"\
1338 "movd -4" #src ", " #t1 " \n\t"\ 1338 "movd -4" #src ", " #t1 " \n\t"\
1339 "psrlq $24, " #t1 " \n\t"\ 1339 "psrlq $24, " #t1 " \n\t"\
1340 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ 1340 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1341 "movd 8" #src ", " #t1 " \n\t"\ 1341 "movd 8" #src ", " #t1 " \n\t"\
1342 "psllq $56, " #t1 " \n\t"\ 1342 "psllq $56, " #t1 " \n\t"\
1343 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ 1343 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1344 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ 1344 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1345 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ 1345 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1346 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ 1346 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1347 PAVGB(lx, pplx) \ 1347 PAVGB(lx, pplx) \
1348 "movq " #lx ", 8(%%"REG_c") \n\t"\ 1348 "movq " #lx ", 8(%%"REG_c") \n\t"\
1349 "movq (%%"REG_c"), " #lx " \n\t"\ 1349 "movq (%%"REG_c"), " #lx " \n\t"\
1350 "psubusb " #lx ", " #t1 " \n\t"\ 1350 "psubusb " #lx ", " #t1 " \n\t"\
1351 "psubusb " #lx ", " #t0 " \n\t"\ 1351 "psubusb " #lx ", " #t0 " \n\t"\
1352 "psubusb " #lx ", " #sx " \n\t"\ 1352 "psubusb " #lx ", " #sx " \n\t"\
1353 "movq "MANGLE(b00)", " #lx " \n\t"\ 1353 "movq "MANGLE(b00)", " #lx " \n\t"\
1354 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ 1354 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1355 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ 1355 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1356 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ 1356 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1357 "paddb " #t1 ", " #t0 " \n\t"\ 1357 "paddb " #t1 ", " #t0 " \n\t"\
1358 "paddb " #t0 ", " #sx " \n\t"\ 1358 "paddb " #t0 ", " #sx " \n\t"\
1359 \ 1359 \
1360 PAVGB(plx, pplx) /* filtered */\ 1360 PAVGB(plx, pplx) /* filtered */\
1361 "movq " #dst ", " #t0 " \n\t" /* dst */\ 1361 "movq " #dst ", " #t0 " \n\t" /* dst */\
1362 "movq " #t0 ", " #t1 " \n\t" /* dst */\ 1362 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1363 "psubusb %3, " #t0 " \n\t"\ 1363 "psubusb %3, " #t0 " \n\t"\
1364 "paddusb %3, " #t1 " \n\t"\ 1364 "paddusb %3, " #t1 " \n\t"\
1365 PMAXUB(t0, pplx)\ 1365 PMAXUB(t0, pplx)\
1366 PMINUB(t1, pplx, t0)\ 1366 PMINUB(t1, pplx, t0)\
1367 "paddb " #sx ", " #ppsx " \n\t"\ 1367 "paddb " #sx ", " #ppsx " \n\t"\
1368 "paddb " #psx ", " #ppsx " \n\t"\ 1368 "paddb " #psx ", " #ppsx " \n\t"\
1369 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ 1369 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1370 "pand "MANGLE(b08)", " #ppsx " \n\t"\ 1370 "pand "MANGLE(b08)", " #ppsx " \n\t"\
1371 "pcmpeqb " #lx ", " #ppsx " \n\t"\ 1371 "pcmpeqb " #lx ", " #ppsx " \n\t"\
1372 "pand " #ppsx ", " #pplx " \n\t"\ 1372 "pand " #ppsx ", " #pplx " \n\t"\
1373 "pandn " #dst ", " #ppsx " \n\t"\ 1373 "pandn " #dst ", " #ppsx " \n\t"\
1374 "por " #pplx ", " #ppsx " \n\t"\ 1374 "por " #pplx ", " #ppsx " \n\t"\
1375 "movq " #ppsx ", " #dst " \n\t"\ 1375 "movq " #ppsx ", " #dst " \n\t"\
1376 "movq 8(%%"REG_c"), " #lx " \n\t" 1376 "movq 8(%%"REG_c"), " #lx " \n\t"
1377 1377
// Extra macro level so that arguments are fully expanded before REAL_DERING_CORE
// stringifies them.
1378 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1378 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1379 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) 1379 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1380 /* 1380 /*
1381 0000000 1381 0000000
1390 1390
1391 1111000 1391 1111000
1392 1110111 1392 1110111
1393 1393
1394 */ 1394 */
// Eight output rows (1..8). The mask-sum registers (mm0/mm2/mm4) and the
// smoothed-row registers (mm1/mm3/mm5) rotate roles from one call to the next so
// each call reuses the two previous rows; mm6 and mm7 are the temporaries.
1395 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) 1395 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1396 DERING_CORE((%%REGa),(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1396 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1397 DERING_CORE((%%REGa, %1),(%%REGa, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1397 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1398 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1398 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1399 DERING_CORE((%0, %1, 4),(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1399 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1400 DERING_CORE((%%REGd),(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1400 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1401 DERING_CORE((%%REGd, %1), (%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1401 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1402 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1402 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1403 DERING_CORE((%0, %1, 8),(%%REGd, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1403 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1404 1404
// Label 1 is the target of the early exit above. No output operands; all four
// operands are inputs; REG_a, REG_d and REG_c are declared clobbered.
1405 "1: \n\t" 1405 "1: \n\t"
1406 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) 1406 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1407 : "%"REG_a, "%"REG_d, "%"REG_c 1407 : "%"REG_a, "%"REG_d, "%"REG_c
1408 ); 1408 );
1409 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1409 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// ---- Portable C path ----
// s[] holds one bitmask per row (10 rows); QP2 is the largest change applied
// to a pixel.
1410 int y; 1410 int y;
1411 int min=255; 1411 int min=255;
1412 int max=0; 1412 int max=0;
1413 int avg; 1413 int avg;
1414 uint8_t *p; 1414 uint8_t *p;
1415 int s[10]; 1415 int s[10];
1416 const int QP2= c->QP/2 + 1; 1416 const int QP2= c->QP/2 + 1;
1417 1417
// Pass 1: min and max over rows 1..8, columns 1..8 (p is advanced before use).
1418 for(y=1; y<9; y++) 1418 for(y=1; y<9; y++)
1419 { 1419 {
1420 int x; 1420 int x;
1421 p= src + stride*y; 1421 p= src + stride*y;
1422 for(x=1; x<9; x++) 1422 for(x=1; x<9; x++)
1423 { 1423 {
1424 p++; 1424 p++;
1425 if(*p > max) max= *p; 1425 if(*p > max) max= *p;
1426 if(*p < min) min= *p; 1426 if(*p < min) min= *p;
1427 } 1427 }
1428 } 1428 }
// Threshold between "bright" and "dark" pixels, rounded up.
1429 avg= (min + max + 1)>>1; 1429 avg= (min + max + 1)>>1;
1430 1430
// Blocks with a small pixel range are left unchanged.
1431 if(max - min <deringThreshold) return; 1431 if(max - min <deringThreshold) return;
1432 1432
// Pass 2: for each of the 10 rows (0..9), bit x of t is set when pixel x of
// that row (columns 0..9) is above avg.
1433 for(y=0; y<10; y++) 1433 for(y=0; y<10; y++)
1434 { 1434 {
1435 int t = 0; 1435 int t = 0;
1436 1436
1437 if(src[stride*y + 0] > avg) t+= 1; 1437 if(src[stride*y + 0] > avg) t+= 1;
1438 if(src[stride*y + 1] > avg) t+= 2; 1438 if(src[stride*y + 1] > avg) t+= 2;
1439 if(src[stride*y + 2] > avg) t+= 4; 1439 if(src[stride*y + 2] > avg) t+= 4;
1440 if(src[stride*y + 3] > avg) t+= 8; 1440 if(src[stride*y + 3] > avg) t+= 8;
1441 if(src[stride*y + 4] > avg) t+= 16; 1441 if(src[stride*y + 4] > avg) t+= 16;
1442 if(src[stride*y + 5] > avg) t+= 32; 1442 if(src[stride*y + 5] > avg) t+= 32;
1443 if(src[stride*y + 6] > avg) t+= 64; 1443 if(src[stride*y + 6] > avg) t+= 64;
1444 if(src[stride*y + 7] > avg) t+= 128; 1444 if(src[stride*y + 7] > avg) t+= 128;
1445 if(src[stride*y + 8] > avg) t+= 256; 1445 if(src[stride*y + 8] > avg) t+= 256;
1446 if(src[stride*y + 9] > avg) t+= 512; 1446 if(src[stride*y + 9] > avg) t+= 512;
1447 1447
// Upper 16 bits get the complement ("pixel <= avg"); the second line keeps a
// bit only if both horizontal neighbours have it too, so afterwards bit x
// (resp. bit 16+x) means pixels x-1, x, x+1 are all above (resp. not above) avg.
1448 t |= (~t)<<16; 1448 t |= (~t)<<16;
1449 t &= (t<<1) & (t>>1); 1449 t &= (t<<1) & (t>>1);
1450 s[y] = t; 1450 s[y] = t;
1451 } 1451 }
1452 1452
// Pass 3: AND three successive rows and fold the "all not above" half onto the
// "all above" half; s[y-1] then flags, for row y, the columns whose whole 3x3
// neighbourhood lies on one side of avg. s[y] and s[y+1] are still unmodified
// when they are read because only s[y-1] is overwritten.
1453 for(y=1; y<9; y++) 1453 for(y=1; y<9; y++)
1454 { 1454 {
1455 int t = s[y-1] & s[y] & s[y+1]; 1455 int t = s[y-1] & s[y] & s[y+1];
1456 t|= t>>16; 1456 t|= t>>16;
1457 s[y-1]= t; 1457 s[y-1]= t;
1458 } 1458 }
1459 1459
// Pass 4: filter the flagged pixels of rows 1..8, columns 1..8.
1460 for(y=1; y<9; y++) 1460 for(y=1; y<9; y++)
1461 { 1461 {
1462 int x; 1462 int x;
1463 int t = s[y-1]; 1463 int t = s[y-1];
1464 1464
1465 p= src + stride*y; 1465 p= src + stride*y;
1466 for(x=1; x<9; x++) 1466 for(x=1; x<9; x++)
1467 { 1467 {
1468 p++; 1468 p++;
1469 if(t & (1<<x)) 1469 if(t & (1<<x))
1470 { 1470 {
// 3x3 kernel with weights 1 2 1 / 2 4 2 / 1 2 1 (sum 16), rounded to nearest.
1471 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) 1471 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1472 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) 1472 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1473 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); 1473 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1474 f= (f + 8)>>4; 1474 f= (f + 8)>>4;
1475 1475
// Debug-only statistics about pixels in low-range blocks (max-min < 20),
// printed whenever numSkiped divides 2^30.
// NOTE(review): QP is used below (worstQP= QP) but is not declared in the
// function as shown - only c->QP and QP2 are; confirm this block still builds
// with DEBUG_DERING_THRESHOLD defined.
1476 #ifdef DEBUG_DERING_THRESHOLD 1476 #ifdef DEBUG_DERING_THRESHOLD
1477 asm volatile("emms\n\t":); 1477 asm volatile("emms\n\t":);
1478 { 1478 {
1479 static long long numPixels=0; 1479 static long long numPixels=0;
1480 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; 1480 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1481 // if((max-min)<20 || (max-min)*QP<200) 1481 // if((max-min)<20 || (max-min)*QP<200)
1482 // if((max-min)*QP < 500) 1482 // if((max-min)*QP < 500)
1483 // if(max-min<QP/2) 1483 // if(max-min<QP/2)
1484 if(max-min < 20) 1484 if(max-min < 20)
1485 { 1485 {
1486 static int numSkiped=0; 1486 static int numSkiped=0;
1487 static int errorSum=0; 1487 static int errorSum=0;
1488 static int worstQP=0; 1488 static int worstQP=0;
1489 static int worstRange=0; 1489 static int worstRange=0;
1490 static int worstDiff=0; 1490 static int worstDiff=0;
1491 int diff= (f - *p); 1491 int diff= (f - *p);
1492 int absDiff= ABS(diff); 1492 int absDiff= ABS(diff);
1493 int error= diff*diff; 1493 int error= diff*diff;
1494 1494
1495 if(x==1 || x==8 || y==1 || y==8) continue; 1495 if(x==1 || x==8 || y==1 || y==8) continue;
1496 1496
1497 numSkiped++; 1497 numSkiped++;
1498 if(absDiff > worstDiff) 1498 if(absDiff > worstDiff)
1499 { 1499 {
1500 worstDiff= absDiff; 1500 worstDiff= absDiff;
1501 worstQP= QP; 1501 worstQP= QP;
1502 worstRange= max-min; 1502 worstRange= max-min;
1503 } 1503 }
1504 errorSum+= error; 1504 errorSum+= error;
1505 1505
1506 if(1024LL*1024LL*1024LL % numSkiped == 0) 1506 if(1024LL*1024LL*1024LL % numSkiped == 0)
1507 { 1507 {
1508 printf( "sum:%1.3f, skip:%d, wQP:%d, " 1508 printf( "sum:%1.3f, skip:%d, wQP:%d, "
1509 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", 1509 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1510 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, 1510 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1511 worstDiff, (float)numSkiped/numPixels); 1511 worstDiff, (float)numSkiped/numPixels);
1512 } 1512 }
1513 } 1513 }
1514 } 1514 }
1515 #endif 1515 #endif
// Move the pixel towards the filtered value f, but by at most QP2.
1516 if (*p + QP2 < f) *p= *p + QP2; 1516 if (*p + QP2 < f) *p= *p + QP2;
1517 else if(*p - QP2 > f) *p= *p - QP2; 1517 else if(*p - QP2 > f) *p= *p - QP2;
1518 else *p=f; 1518 else *p=f;
1519 } 1519 }
1520 } 1520 }
1521 } 1521 }
// Debug-only: brighten the pixels of rows 1..8, columns 1..8 by 20 (capped at
// 255) in blocks whose range is below 20.
1522 #ifdef DEBUG_DERING_THRESHOLD 1522 #ifdef DEBUG_DERING_THRESHOLD
1523 if(max-min < 20) 1523 if(max-min < 20)
1524 { 1524 {
1525 for(y=1; y<9; y++) 1525 for(y=1; y<9; y++)
1526 { 1526 {
1527 int x; 1527 int x;
1528 int t = 0; 1528 int t = 0;
1529 p= src + stride*y; 1529 p= src + stride*y;
1530 for(x=1; x<9; x++) 1530 for(x=1; x<9; x++)
1531 { 1531 {
1532 p++; 1532 p++;
1533 *p = MIN(*p + 20, 255); 1533 *p = MIN(*p + 20, 255);
1534 } 1534 }
1535 } 1535 }
1536 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; 1536 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1537 } 1537 }
1538 #endif 1538 #endif
1539 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1539 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1540 } 1540 }
1541 #endif //HAVE_ALTIVEC 1541 #endif //HAVE_ALTIVEC
1542 1542
1547 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1547 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1548 */ 1548 */
1549 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 1549 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1550 { 1550 {
1551 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1551 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1552 src+= 4*stride; 1552 src+= 4*stride;
1553 asm volatile( 1553 asm volatile(
1554 "lea (%0, %1), %%"REG_a" \n\t" 1554 "lea (%0, %1), %%"REG_a" \n\t"
1555 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 1555 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1556 // 0 1 2 3 4 5 6 7 8 9 1556 // 0 1 2 3 4 5 6 7 8 9
1557 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 1557 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1558 1558
1559 "movq (%0), %%mm0 \n\t" 1559 "movq (%0), %%mm0 \n\t"
1560 "movq (%%"REG_a", %1), %%mm1 \n\t" 1560 "movq (%%"REG_a", %1), %%mm1 \n\t"
1561 PAVGB(%%mm1, %%mm0) 1561 PAVGB(%%mm1, %%mm0)
1562 "movq %%mm0, (%%"REG_a") \n\t" 1562 "movq %%mm0, (%%"REG_a") \n\t"
1563 "movq (%0, %1, 4), %%mm0 \n\t" 1563 "movq (%0, %1, 4), %%mm0 \n\t"
1564 PAVGB(%%mm0, %%mm1) 1564 PAVGB(%%mm0, %%mm1)
1565 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" 1565 "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
1566 "movq (%%"REG_c", %1), %%mm1 \n\t" 1566 "movq (%%"REG_c", %1), %%mm1 \n\t"
1567 PAVGB(%%mm1, %%mm0) 1567 PAVGB(%%mm1, %%mm0)
1568 "movq %%mm0, (%%"REG_c") \n\t" 1568 "movq %%mm0, (%%"REG_c") \n\t"
1569 "movq (%0, %1, 8), %%mm0 \n\t" 1569 "movq (%0, %1, 8), %%mm0 \n\t"
1570 PAVGB(%%mm0, %%mm1) 1570 PAVGB(%%mm0, %%mm1)
1571 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" 1571 "movq %%mm1, (%%"REG_c", %1, 2) \n\t"
1572 1572
1573 : : "r" (src), "r" ((long)stride) 1573 : : "r" (src), "r" ((long)stride)
1574 : "%"REG_a, "%"REG_c 1574 : "%"REG_a, "%"REG_c
1575 ); 1575 );
1576 #else 1576 #else
1577 int a, b, x; 1577 int a, b, x;
1578 src+= 4*stride; 1578 src+= 4*stride;
1579 1579
1580 for(x=0; x<2; x++){ 1580 for(x=0; x<2; x++){
1581 a= *(uint32_t*)&src[stride*0]; 1581 a= *(uint32_t*)&src[stride*0];
1582 b= *(uint32_t*)&src[stride*2]; 1582 b= *(uint32_t*)&src[stride*2];
1583 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1583 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1584 a= *(uint32_t*)&src[stride*4]; 1584 a= *(uint32_t*)&src[stride*4];
1585 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1585 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1586 b= *(uint32_t*)&src[stride*6]; 1586 b= *(uint32_t*)&src[stride*6];
1587 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1587 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1588 a= *(uint32_t*)&src[stride*8]; 1588 a= *(uint32_t*)&src[stride*8];
1589 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1589 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1590 src += 4; 1590 src += 4;
1591 } 1591 }
1592 #endif 1592 #endif
1593 } 1593 }
1594 1594
1595 /** 1595 /**
1596 * Deinterlaces the given block by cubic interpolating every second line. 1596 * Deinterlaces the given block by cubic interpolating every second line.
1600 * this filter will read lines 3-15 and write 7-13 1600 * this filter will read lines 3-15 and write 7-13
1601 */ 1601 */
1602 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) 1602 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1603 { 1603 {
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1605 src+= stride*3; 1605 src+= stride*3;
1606 asm volatile( 1606 asm volatile(
1607 "lea (%0, %1), %%"REG_a" \n\t" 1607 "lea (%0, %1), %%"REG_a" \n\t"
1608 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1608 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1609 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" 1609 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1610 "add %1, %%"REG_c" \n\t" 1610 "add %1, %%"REG_c" \n\t"
1611 "pxor %%mm7, %%mm7 \n\t" 1611 "pxor %%mm7, %%mm7 \n\t"
1612 // 0 1 2 3 4 5 6 7 8 9 10 1612 // 0 1 2 3 4 5 6 7 8 9 10
1613 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1613 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1614 1614
1615 #define REAL_DEINT_CUBIC(a,b,c,d,e)\ 1615 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1616 "movq " #a ", %%mm0 \n\t"\ 1616 "movq " #a ", %%mm0 \n\t"\
1617 "movq " #b ", %%mm1 \n\t"\ 1617 "movq " #b ", %%mm1 \n\t"\
1618 "movq " #d ", %%mm2 \n\t"\ 1618 "movq " #d ", %%mm2 \n\t"\
1619 "movq " #e ", %%mm3 \n\t"\ 1619 "movq " #e ", %%mm3 \n\t"\
1620 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ 1620 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1621 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ 1621 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
1622 "movq %%mm0, %%mm2 \n\t"\ 1622 "movq %%mm0, %%mm2 \n\t"\
1623 "punpcklbw %%mm7, %%mm0 \n\t"\ 1623 "punpcklbw %%mm7, %%mm0 \n\t"\
1624 "punpckhbw %%mm7, %%mm2 \n\t"\ 1624 "punpckhbw %%mm7, %%mm2 \n\t"\
1625 "movq %%mm1, %%mm3 \n\t"\ 1625 "movq %%mm1, %%mm3 \n\t"\
1626 "punpcklbw %%mm7, %%mm1 \n\t"\ 1626 "punpcklbw %%mm7, %%mm1 \n\t"\
1627 "punpckhbw %%mm7, %%mm3 \n\t"\ 1627 "punpckhbw %%mm7, %%mm3 \n\t"\
1628 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ 1628 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1629 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ 1629 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1630 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ 1630 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1631 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ 1631 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1632 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ 1632 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1633 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ 1633 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1634 "packuswb %%mm3, %%mm1 \n\t"\ 1634 "packuswb %%mm3, %%mm1 \n\t"\
1635 "movq %%mm1, " #c " \n\t" 1635 "movq %%mm1, " #c " \n\t"
1636 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) 1636 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1637 1637
1638 DEINT_CUBIC((%0), (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd, %1)) 1638 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1639 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4), (%%REGd), (%%REGd, %1), (%0, %1, 8)) 1639 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
1640 DEINT_CUBIC((%0, %1, 4), (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGc)) 1640 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1641 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8), (%%REGd, %1, 4), (%%REGc), (%%REGc, %1, 2)) 1641 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1642 1642
1643 : : "r" (src), "r" ((long)stride) 1643 : : "r" (src), "r" ((long)stride)
1644 : "%"REG_a, "%"REG_d, "%"REG_c 1644 : "%"REG_a, "%"REG_d, "%"REG_c
1645 ); 1645 );
1646 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1646 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1647 int x; 1647 int x;
1648 src+= stride*3; 1648 src+= stride*3;
1649 for(x=0; x<8; x++) 1649 for(x=0; x<8; x++)
1650 { 1650 {
1651 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); 1651 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1652 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); 1652 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1653 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); 1653 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1654 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); 1654 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1655 src++; 1655 src++;
1656 } 1656 }
1657 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1657 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1658 } 1658 }
1659 1659
1660 /** 1660 /**
1661 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. 1661 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1665 * this filter will read lines 4-13 and write 5-11 1665 * this filter will read lines 4-13 and write 5-11
1666 */ 1666 */
1667 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 1667 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1668 { 1668 {
1669 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1669 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1670 src+= stride*4; 1670 src+= stride*4;
1671 asm volatile( 1671 asm volatile(
1672 "lea (%0, %1), %%"REG_a" \n\t" 1672 "lea (%0, %1), %%"REG_a" \n\t"
1673 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1673 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1674 "pxor %%mm7, %%mm7 \n\t" 1674 "pxor %%mm7, %%mm7 \n\t"
1675 "movq (%2), %%mm0 \n\t" 1675 "movq (%2), %%mm0 \n\t"
1676 // 0 1 2 3 4 5 6 7 8 9 10 1676 // 0 1 2 3 4 5 6 7 8 9 10
1677 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1677 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1678 1678
1679 #define REAL_DEINT_FF(a,b,c,d)\ 1679 #define REAL_DEINT_FF(a,b,c,d)\
1680 "movq " #a ", %%mm1 \n\t"\ 1680 "movq " #a ", %%mm1 \n\t"\
1681 "movq " #b ", %%mm2 \n\t"\ 1681 "movq " #b ", %%mm2 \n\t"\
1682 "movq " #c ", %%mm3 \n\t"\ 1682 "movq " #c ", %%mm3 \n\t"\
1683 "movq " #d ", %%mm4 \n\t"\ 1683 "movq " #d ", %%mm4 \n\t"\
1684 PAVGB(%%mm3, %%mm1) \ 1684 PAVGB(%%mm3, %%mm1) \
1685 PAVGB(%%mm4, %%mm0) \ 1685 PAVGB(%%mm4, %%mm0) \
1686 "movq %%mm0, %%mm3 \n\t"\ 1686 "movq %%mm0, %%mm3 \n\t"\
1687 "punpcklbw %%mm7, %%mm0 \n\t"\ 1687 "punpcklbw %%mm7, %%mm0 \n\t"\
1688 "punpckhbw %%mm7, %%mm3 \n\t"\ 1688 "punpckhbw %%mm7, %%mm3 \n\t"\
1689 "movq %%mm1, %%mm4 \n\t"\ 1689 "movq %%mm1, %%mm4 \n\t"\
1690 "punpcklbw %%mm7, %%mm1 \n\t"\ 1690 "punpcklbw %%mm7, %%mm1 \n\t"\
1691 "punpckhbw %%mm7, %%mm4 \n\t"\ 1691 "punpckhbw %%mm7, %%mm4 \n\t"\
1692 "psllw $2, %%mm1 \n\t"\ 1692 "psllw $2, %%mm1 \n\t"\
1693 "psllw $2, %%mm4 \n\t"\ 1693 "psllw $2, %%mm4 \n\t"\
1694 "psubw %%mm0, %%mm1 \n\t"\ 1694 "psubw %%mm0, %%mm1 \n\t"\
1695 "psubw %%mm3, %%mm4 \n\t"\ 1695 "psubw %%mm3, %%mm4 \n\t"\
1696 "movq %%mm2, %%mm5 \n\t"\ 1696 "movq %%mm2, %%mm5 \n\t"\
1697 "movq %%mm2, %%mm0 \n\t"\ 1697 "movq %%mm2, %%mm0 \n\t"\
1698 "punpcklbw %%mm7, %%mm2 \n\t"\ 1698 "punpcklbw %%mm7, %%mm2 \n\t"\
1699 "punpckhbw %%mm7, %%mm5 \n\t"\ 1699 "punpckhbw %%mm7, %%mm5 \n\t"\
1700 "paddw %%mm2, %%mm1 \n\t"\ 1700 "paddw %%mm2, %%mm1 \n\t"\
1701 "paddw %%mm5, %%mm4 \n\t"\ 1701 "paddw %%mm5, %%mm4 \n\t"\
1702 "psraw $2, %%mm1 \n\t"\ 1702 "psraw $2, %%mm1 \n\t"\
1703 "psraw $2, %%mm4 \n\t"\ 1703 "psraw $2, %%mm4 \n\t"\
1704 "packuswb %%mm4, %%mm1 \n\t"\ 1704 "packuswb %%mm4, %%mm1 \n\t"\
1705 "movq %%mm1, " #b " \n\t"\ 1705 "movq %%mm1, " #b " \n\t"\
1706 1706
1707 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) 1707 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1708 1708
1709 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) 1709 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
1710 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd) ) 1710 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1711 DEINT_FF((%0, %1, 4), (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) 1711 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
1712 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGd, %1, 4)) 1712 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1713 1713
1714 "movq %%mm0, (%2) \n\t" 1714 "movq %%mm0, (%2) \n\t"
1715 : : "r" (src), "r" ((long)stride), "r"(tmp) 1715 : : "r" (src), "r" ((long)stride), "r"(tmp)
1716 : "%"REG_a, "%"REG_d 1716 : "%"REG_a, "%"REG_d
1717 ); 1717 );
1718 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1718 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1719 int x; 1719 int x;
1720 src+= stride*4; 1720 src+= stride*4;
1721 for(x=0; x<8; x++) 1721 for(x=0; x<8; x++)
1722 { 1722 {
1723 int t1= tmp[x]; 1723 int t1= tmp[x];
1724 int t2= src[stride*1]; 1724 int t2= src[stride*1];
1725 1725
1726 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); 1726 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1727 t1= src[stride*4]; 1727 t1= src[stride*4];
1728 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); 1728 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1729 t2= src[stride*6]; 1729 t2= src[stride*6];
1730 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); 1730 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1731 t1= src[stride*8]; 1731 t1= src[stride*8];
1732 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 1732 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1733 tmp[x]= t1; 1733 tmp[x]= t1;
1734 1734
1735 src++; 1735 src++;
1736 } 1736 }
1737 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1737 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1738 } 1738 }
1739 1739
1740 /** 1740 /**
1741 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. 1741 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1745 * this filter will read lines 4-13 and write 4-11 1745 * this filter will read lines 4-13 and write 4-11
1746 */ 1746 */
1747 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 1747 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1748 { 1748 {
1749 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1749 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1750 src+= stride*4; 1750 src+= stride*4;
1751 asm volatile( 1751 asm volatile(
1752 "lea (%0, %1), %%"REG_a" \n\t" 1752 "lea (%0, %1), %%"REG_a" \n\t"
1753 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1753 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1754 "pxor %%mm7, %%mm7 \n\t" 1754 "pxor %%mm7, %%mm7 \n\t"
1755 "movq (%2), %%mm0 \n\t" 1755 "movq (%2), %%mm0 \n\t"
1756 "movq (%3), %%mm1 \n\t" 1756 "movq (%3), %%mm1 \n\t"
1757 // 0 1 2 3 4 5 6 7 8 9 10 1757 // 0 1 2 3 4 5 6 7 8 9 10
1758 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1758 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1759 1759
1760 #define REAL_DEINT_L5(t1,t2,a,b,c)\ 1760 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1761 "movq " #a ", %%mm2 \n\t"\ 1761 "movq " #a ", %%mm2 \n\t"\
1762 "movq " #b ", %%mm3 \n\t"\ 1762 "movq " #b ", %%mm3 \n\t"\
1763 "movq " #c ", %%mm4 \n\t"\ 1763 "movq " #c ", %%mm4 \n\t"\
1764 PAVGB(t2, %%mm3) \ 1764 PAVGB(t2, %%mm3) \
1765 PAVGB(t1, %%mm4) \ 1765 PAVGB(t1, %%mm4) \
1766 "movq %%mm2, %%mm5 \n\t"\ 1766 "movq %%mm2, %%mm5 \n\t"\
1767 "movq %%mm2, " #t1 " \n\t"\ 1767 "movq %%mm2, " #t1 " \n\t"\
1768 "punpcklbw %%mm7, %%mm2 \n\t"\ 1768 "punpcklbw %%mm7, %%mm2 \n\t"\
1769 "punpckhbw %%mm7, %%mm5 \n\t"\ 1769 "punpckhbw %%mm7, %%mm5 \n\t"\
1770 "movq %%mm2, %%mm6 \n\t"\ 1770 "movq %%mm2, %%mm6 \n\t"\
1771 "paddw %%mm2, %%mm2 \n\t"\ 1771 "paddw %%mm2, %%mm2 \n\t"\
1772 "paddw %%mm6, %%mm2 \n\t"\ 1772 "paddw %%mm6, %%mm2 \n\t"\
1773 "movq %%mm5, %%mm6 \n\t"\ 1773 "movq %%mm5, %%mm6 \n\t"\
1774 "paddw %%mm5, %%mm5 \n\t"\ 1774 "paddw %%mm5, %%mm5 \n\t"\
1775 "paddw %%mm6, %%mm5 \n\t"\ 1775 "paddw %%mm6, %%mm5 \n\t"\
1776 "movq %%mm3, %%mm6 \n\t"\ 1776 "movq %%mm3, %%mm6 \n\t"\
1777 "punpcklbw %%mm7, %%mm3 \n\t"\ 1777 "punpcklbw %%mm7, %%mm3 \n\t"\
1778 "punpckhbw %%mm7, %%mm6 \n\t"\ 1778 "punpckhbw %%mm7, %%mm6 \n\t"\
1779 "paddw %%mm3, %%mm3 \n\t"\ 1779 "paddw %%mm3, %%mm3 \n\t"\
1780 "paddw %%mm6, %%mm6 \n\t"\ 1780 "paddw %%mm6, %%mm6 \n\t"\
1781 "paddw %%mm3, %%mm2 \n\t"\ 1781 "paddw %%mm3, %%mm2 \n\t"\
1782 "paddw %%mm6, %%mm5 \n\t"\ 1782 "paddw %%mm6, %%mm5 \n\t"\
1783 "movq %%mm4, %%mm6 \n\t"\ 1783 "movq %%mm4, %%mm6 \n\t"\
1784 "punpcklbw %%mm7, %%mm4 \n\t"\ 1784 "punpcklbw %%mm7, %%mm4 \n\t"\
1785 "punpckhbw %%mm7, %%mm6 \n\t"\ 1785 "punpckhbw %%mm7, %%mm6 \n\t"\
1786 "psubw %%mm4, %%mm2 \n\t"\ 1786 "psubw %%mm4, %%mm2 \n\t"\
1787 "psubw %%mm6, %%mm5 \n\t"\ 1787 "psubw %%mm6, %%mm5 \n\t"\
1788 "psraw $2, %%mm2 \n\t"\ 1788 "psraw $2, %%mm2 \n\t"\
1789 "psraw $2, %%mm5 \n\t"\ 1789 "psraw $2, %%mm5 \n\t"\
1790 "packuswb %%mm5, %%mm2 \n\t"\ 1790 "packuswb %%mm5, %%mm2 \n\t"\
1791 "movq %%mm2, " #a " \n\t"\ 1791 "movq %%mm2, " #a " \n\t"\
1792 1792
1793 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) 1793 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1794 1794
1795 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) 1795 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
1796 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) 1796 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
1799 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) 1799 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
1800 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) 1800 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
1801 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) 1801 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
1802 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1802 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1803 1803
1804 "movq %%mm0, (%2) \n\t" 1804 "movq %%mm0, (%2) \n\t"
1805 "movq %%mm1, (%3) \n\t" 1805 "movq %%mm1, (%3) \n\t"
1806 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2) 1806 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
1807 : "%"REG_a, "%"REG_d 1807 : "%"REG_a, "%"REG_d
1808 ); 1808 );
1809 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1809 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1810 int x; 1810 int x;
1811 src+= stride*4; 1811 src+= stride*4;
1812 for(x=0; x<8; x++) 1812 for(x=0; x<8; x++)
1813 { 1813 {
1814 int t1= tmp[x]; 1814 int t1= tmp[x];
1815 int t2= tmp2[x]; 1815 int t2= tmp2[x];
1816 int t3= src[0]; 1816 int t3= src[0];
1817 1817
1818 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); 1818 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1819 t1= src[stride*1]; 1819 t1= src[stride*1];
1820 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); 1820 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1821 t2= src[stride*2]; 1821 t2= src[stride*2];
1822 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); 1822 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1823 t3= src[stride*3]; 1823 t3= src[stride*3];
1824 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); 1824 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1825 t1= src[stride*4]; 1825 t1= src[stride*4];
1826 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); 1826 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1827 t2= src[stride*5]; 1827 t2= src[stride*5];
1828 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); 1828 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1829 t3= src[stride*6]; 1829 t3= src[stride*6];
1830 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); 1830 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1831 t1= src[stride*7]; 1831 t1= src[stride*7];
1832 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); 1832 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1833 1833
1834 tmp[x]= t3; 1834 tmp[x]= t3;
1835 tmp2[x]= t1; 1835 tmp2[x]= t1;
1836 1836
1837 src++; 1837 src++;
1838 } 1838 }
1839 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1839 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1840 } 1840 }
1841 1841
1842 /** 1842 /**
1843 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. 1843 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1847 * this filter will read lines 4-13 and write 4-11 1847 * this filter will read lines 4-13 and write 4-11
1848 */ 1848 */
1849 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 1849 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1850 { 1850 {
1851 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1851 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1852 src+= 4*stride; 1852 src+= 4*stride;
1853 asm volatile( 1853 asm volatile(
1854 "lea (%0, %1), %%"REG_a" \n\t" 1854 "lea (%0, %1), %%"REG_a" \n\t"
1855 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1855 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1856 // 0 1 2 3 4 5 6 7 8 9 1856 // 0 1 2 3 4 5 6 7 8 9
1857 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1857 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1858 1858
1859 "movq (%2), %%mm0 \n\t" // L0 1859 "movq (%2), %%mm0 \n\t" // L0
1860 "movq (%%"REG_a"), %%mm1 \n\t" // L2 1860 "movq (%%"REG_a"), %%mm1 \n\t" // L2
1861 PAVGB(%%mm1, %%mm0) // L0+L2 1861 PAVGB(%%mm1, %%mm0) // L0+L2
1862 "movq (%0), %%mm2 \n\t" // L1 1862 "movq (%0), %%mm2 \n\t" // L1
1863 PAVGB(%%mm2, %%mm0) 1863 PAVGB(%%mm2, %%mm0)
1864 "movq %%mm0, (%0) \n\t" 1864 "movq %%mm0, (%0) \n\t"
1865 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 1865 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3
1866 PAVGB(%%mm0, %%mm2) // L1+L3 1866 PAVGB(%%mm0, %%mm2) // L1+L3
1867 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 1867 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1868 "movq %%mm2, (%%"REG_a") \n\t" 1868 "movq %%mm2, (%%"REG_a") \n\t"
1869 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 1869 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
1870 PAVGB(%%mm2, %%mm1) // L2+L4 1870 PAVGB(%%mm2, %%mm1) // L2+L4
1871 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 1871 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1872 "movq %%mm1, (%%"REG_a", %1) \n\t" 1872 "movq %%mm1, (%%"REG_a", %1) \n\t"
1873 "movq (%0, %1, 4), %%mm1 \n\t" // L5 1873 "movq (%0, %1, 4), %%mm1 \n\t" // L5
1874 PAVGB(%%mm1, %%mm0) // L3+L5 1874 PAVGB(%%mm1, %%mm0) // L3+L5
1875 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 1875 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1876 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 1876 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
1877 "movq (%%"REG_d"), %%mm0 \n\t" // L6 1877 "movq (%%"REG_d"), %%mm0 \n\t" // L6
1878 PAVGB(%%mm0, %%mm2) // L4+L6 1878 PAVGB(%%mm0, %%mm2) // L4+L6
1879 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 1879 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1880 "movq %%mm2, (%0, %1, 4) \n\t" 1880 "movq %%mm2, (%0, %1, 4) \n\t"
1881 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 1881 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7
1882 PAVGB(%%mm2, %%mm1) // L5+L7 1882 PAVGB(%%mm2, %%mm1) // L5+L7
1883 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 1883 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1884 "movq %%mm1, (%%"REG_d") \n\t" 1884 "movq %%mm1, (%%"REG_d") \n\t"
1885 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 1885 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
1886 PAVGB(%%mm1, %%mm0) // L6+L8 1886 PAVGB(%%mm1, %%mm0) // L6+L8
1887 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 1887 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1888 "movq %%mm0, (%%"REG_d", %1) \n\t" 1888 "movq %%mm0, (%%"REG_d", %1) \n\t"
1889 "movq (%0, %1, 8), %%mm0 \n\t" // L9 1889 "movq (%0, %1, 8), %%mm0 \n\t" // L9
1890 PAVGB(%%mm0, %%mm2) // L7+L9 1890 PAVGB(%%mm0, %%mm2) // L7+L9
1891 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 1891 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1892 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1892 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1893 "movq %%mm1, (%2) \n\t" 1893 "movq %%mm1, (%2) \n\t"
1894 1894
1895 : : "r" (src), "r" ((long)stride), "r" (tmp) 1895 : : "r" (src), "r" ((long)stride), "r" (tmp)
1896 : "%"REG_a, "%"REG_d 1896 : "%"REG_a, "%"REG_d
1897 ); 1897 );
1898 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1898 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1899 int a, b, c, x; 1899 int a, b, c, x;
1900 src+= 4*stride; 1900 src+= 4*stride;
1901 1901
1902 for(x=0; x<2; x++){ 1902 for(x=0; x<2; x++){
1903 a= *(uint32_t*)&tmp[stride*0]; 1903 a= *(uint32_t*)&tmp[stride*0];
1904 b= *(uint32_t*)&src[stride*0]; 1904 b= *(uint32_t*)&src[stride*0];
1905 c= *(uint32_t*)&src[stride*1]; 1905 c= *(uint32_t*)&src[stride*1];
1906 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1906 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1907 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1907 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1908 1908
1909 a= *(uint32_t*)&src[stride*2]; 1909 a= *(uint32_t*)&src[stride*2];
1910 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1910 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1911 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1911 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1912 1912
1913 b= *(uint32_t*)&src[stride*3]; 1913 b= *(uint32_t*)&src[stride*3];
1914 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1914 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1915 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1915 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1916 1916
1917 c= *(uint32_t*)&src[stride*4]; 1917 c= *(uint32_t*)&src[stride*4];
1918 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1918 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1919 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1919 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1920 1920
1921 a= *(uint32_t*)&src[stride*5]; 1921 a= *(uint32_t*)&src[stride*5];
1922 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1922 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1923 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1923 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1924 1924
1925 b= *(uint32_t*)&src[stride*6]; 1925 b= *(uint32_t*)&src[stride*6];
1926 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1926 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1927 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1927 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1928 1928
1929 c= *(uint32_t*)&src[stride*7]; 1929 c= *(uint32_t*)&src[stride*7];
1930 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1930 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1931 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1931 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1932 1932
1933 a= *(uint32_t*)&src[stride*8]; 1933 a= *(uint32_t*)&src[stride*8];
1934 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1934 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1935 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1935 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1936 1936
1937 *(uint32_t*)&tmp[stride*0]= c; 1937 *(uint32_t*)&tmp[stride*0]= c;
1938 src += 4; 1938 src += 4;
1939 tmp += 4; 1939 tmp += 4;
1940 } 1940 }
1941 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1941 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1942 } 1942 }
1943 1943
1944 /** 1944 /**
1945 * Deinterlaces the given block by applying a median filter to every second line. 1945 * Deinterlaces the given block by applying a median filter to every second line.
1948 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1948 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1949 */ 1949 */
1950 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 1950 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1951 { 1951 {
1952 #ifdef HAVE_MMX 1952 #ifdef HAVE_MMX
1953 src+= 4*stride; 1953 src+= 4*stride;
1954 #ifdef HAVE_MMX2 1954 #ifdef HAVE_MMX2
1955 asm volatile( 1955 asm volatile(
1956 "lea (%0, %1), %%"REG_a" \n\t" 1956 "lea (%0, %1), %%"REG_a" \n\t"
1957 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1957 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1958 // 0 1 2 3 4 5 6 7 8 9 1958 // 0 1 2 3 4 5 6 7 8 9
1959 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1959 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1960 1960
1961 "movq (%0), %%mm0 \n\t" // 1961 "movq (%0), %%mm0 \n\t" //
1962 "movq (%%"REG_a", %1), %%mm2 \n\t" // 1962 "movq (%%"REG_a", %1), %%mm2 \n\t" //
1963 "movq (%%"REG_a"), %%mm1 \n\t" // 1963 "movq (%%"REG_a"), %%mm1 \n\t" //
1964 "movq %%mm0, %%mm3 \n\t" 1964 "movq %%mm0, %%mm3 \n\t"
1965 "pmaxub %%mm1, %%mm0 \n\t" // 1965 "pmaxub %%mm1, %%mm0 \n\t" //
1966 "pminub %%mm3, %%mm1 \n\t" // 1966 "pminub %%mm3, %%mm1 \n\t" //
1967 "pmaxub %%mm2, %%mm1 \n\t" // 1967 "pmaxub %%mm2, %%mm1 \n\t" //
1968 "pminub %%mm1, %%mm0 \n\t" 1968 "pminub %%mm1, %%mm0 \n\t"
1969 "movq %%mm0, (%%"REG_a") \n\t" 1969 "movq %%mm0, (%%"REG_a") \n\t"
1970 1970
1971 "movq (%0, %1, 4), %%mm0 \n\t" // 1971 "movq (%0, %1, 4), %%mm0 \n\t" //
1972 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // 1972 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
1973 "movq %%mm2, %%mm3 \n\t" 1973 "movq %%mm2, %%mm3 \n\t"
1974 "pmaxub %%mm1, %%mm2 \n\t" // 1974 "pmaxub %%mm1, %%mm2 \n\t" //
1975 "pminub %%mm3, %%mm1 \n\t" // 1975 "pminub %%mm3, %%mm1 \n\t" //
1976 "pmaxub %%mm0, %%mm1 \n\t" // 1976 "pmaxub %%mm0, %%mm1 \n\t" //
1977 "pminub %%mm1, %%mm2 \n\t" 1977 "pminub %%mm1, %%mm2 \n\t"
1978 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 1978 "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
1979 1979
1980 "movq (%%"REG_d"), %%mm2 \n\t" // 1980 "movq (%%"REG_d"), %%mm2 \n\t" //
1981 "movq (%%"REG_d", %1), %%mm1 \n\t" // 1981 "movq (%%"REG_d", %1), %%mm1 \n\t" //
1982 "movq %%mm2, %%mm3 \n\t" 1982 "movq %%mm2, %%mm3 \n\t"
1983 "pmaxub %%mm0, %%mm2 \n\t" // 1983 "pmaxub %%mm0, %%mm2 \n\t" //
1984 "pminub %%mm3, %%mm0 \n\t" // 1984 "pminub %%mm3, %%mm0 \n\t" //
1985 "pmaxub %%mm1, %%mm0 \n\t" // 1985 "pmaxub %%mm1, %%mm0 \n\t" //
1986 "pminub %%mm0, %%mm2 \n\t" 1986 "pminub %%mm0, %%mm2 \n\t"
1987 "movq %%mm2, (%%"REG_d") \n\t" 1987 "movq %%mm2, (%%"REG_d") \n\t"
1988 1988
1989 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // 1989 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
1990 "movq (%0, %1, 8), %%mm0 \n\t" // 1990 "movq (%0, %1, 8), %%mm0 \n\t" //
1991 "movq %%mm2, %%mm3 \n\t" 1991 "movq %%mm2, %%mm3 \n\t"
1992 "pmaxub %%mm0, %%mm2 \n\t" // 1992 "pmaxub %%mm0, %%mm2 \n\t" //
1993 "pminub %%mm3, %%mm0 \n\t" // 1993 "pminub %%mm3, %%mm0 \n\t" //
1994 "pmaxub %%mm1, %%mm0 \n\t" // 1994 "pmaxub %%mm1, %%mm0 \n\t" //
1995 "pminub %%mm0, %%mm2 \n\t" 1995 "pminub %%mm0, %%mm2 \n\t"
1996 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1996 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1997 1997
1998 1998
1999 : : "r" (src), "r" ((long)stride) 1999 : : "r" (src), "r" ((long)stride)
2000 : "%"REG_a, "%"REG_d 2000 : "%"REG_a, "%"REG_d
2001 ); 2001 );
2002 2002
2003 #else // MMX without MMX2 2003 #else // MMX without MMX2
2004 asm volatile( 2004 asm volatile(
2005 "lea (%0, %1), %%"REG_a" \n\t" 2005 "lea (%0, %1), %%"REG_a" \n\t"
2006 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 2006 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
2007 // 0 1 2 3 4 5 6 7 8 9 2007 // 0 1 2 3 4 5 6 7 8 9
2008 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2008 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2009 "pxor %%mm7, %%mm7 \n\t" 2009 "pxor %%mm7, %%mm7 \n\t"
2010 2010
2011 #define REAL_MEDIAN(a,b,c)\ 2011 #define REAL_MEDIAN(a,b,c)\
2012 "movq " #a ", %%mm0 \n\t"\ 2012 "movq " #a ", %%mm0 \n\t"\
2013 "movq " #b ", %%mm2 \n\t"\ 2013 "movq " #b ", %%mm2 \n\t"\
2014 "movq " #c ", %%mm1 \n\t"\ 2014 "movq " #c ", %%mm1 \n\t"\
2015 "movq %%mm0, %%mm3 \n\t"\ 2015 "movq %%mm0, %%mm3 \n\t"\
2016 "movq %%mm1, %%mm4 \n\t"\ 2016 "movq %%mm1, %%mm4 \n\t"\
2017 "movq %%mm2, %%mm5 \n\t"\ 2017 "movq %%mm2, %%mm5 \n\t"\
2018 "psubusb %%mm1, %%mm3 \n\t"\ 2018 "psubusb %%mm1, %%mm3 \n\t"\
2019 "psubusb %%mm2, %%mm4 \n\t"\ 2019 "psubusb %%mm2, %%mm4 \n\t"\
2020 "psubusb %%mm0, %%mm5 \n\t"\ 2020 "psubusb %%mm0, %%mm5 \n\t"\
2021 "pcmpeqb %%mm7, %%mm3 \n\t"\ 2021 "pcmpeqb %%mm7, %%mm3 \n\t"\
2022 "pcmpeqb %%mm7, %%mm4 \n\t"\ 2022 "pcmpeqb %%mm7, %%mm4 \n\t"\
2023 "pcmpeqb %%mm7, %%mm5 \n\t"\ 2023 "pcmpeqb %%mm7, %%mm5 \n\t"\
2024 "movq %%mm3, %%mm6 \n\t"\ 2024 "movq %%mm3, %%mm6 \n\t"\
2025 "pxor %%mm4, %%mm3 \n\t"\ 2025 "pxor %%mm4, %%mm3 \n\t"\
2026 "pxor %%mm5, %%mm4 \n\t"\ 2026 "pxor %%mm5, %%mm4 \n\t"\
2027 "pxor %%mm6, %%mm5 \n\t"\ 2027 "pxor %%mm6, %%mm5 \n\t"\
2028 "por %%mm3, %%mm1 \n\t"\ 2028 "por %%mm3, %%mm1 \n\t"\
2029 "por %%mm4, %%mm2 \n\t"\ 2029 "por %%mm4, %%mm2 \n\t"\
2030 "por %%mm5, %%mm0 \n\t"\ 2030 "por %%mm5, %%mm0 \n\t"\
2031 "pand %%mm2, %%mm0 \n\t"\ 2031 "pand %%mm2, %%mm0 \n\t"\
2032 "pand %%mm1, %%mm0 \n\t"\ 2032 "pand %%mm1, %%mm0 \n\t"\
2033 "movq %%mm0, " #b " \n\t" 2033 "movq %%mm0, " #b " \n\t"
2034 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) 2034 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
2035 2035
2036 MEDIAN((%0), (%%REGa), (%%REGa, %1)) 2036 MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
2037 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) 2037 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
2038 MEDIAN((%0, %1, 4), (%%REGd), (%%REGd, %1)) 2038 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
2039 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) 2039 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
2040 2040
2041 : : "r" (src), "r" ((long)stride) 2041 : : "r" (src), "r" ((long)stride)
2042 : "%"REG_a, "%"REG_d 2042 : "%"REG_a, "%"REG_d
2043 ); 2043 );
2044 #endif //HAVE_MMX2 2044 #endif //HAVE_MMX2
2045 #else //HAVE_MMX 2045 #else //HAVE_MMX
2046 int x, y; 2046 int x, y;
2047 src+= 4*stride; 2047 src+= 4*stride;
2048 // FIXME - there should be a way to do a few columns in parallel like w/mmx 2048 // FIXME - there should be a way to do a few columns in parallel like w/mmx
2049 for(x=0; x<8; x++) 2049 for(x=0; x<8; x++)
2050 { 2050 {
2051 uint8_t *colsrc = src; 2051 uint8_t *colsrc = src;
2052 for (y=0; y<4; y++) 2052 for (y=0; y<4; y++)
2053 { 2053 {
2054 int a, b, c, d, e, f; 2054 int a, b, c, d, e, f;
2055 a = colsrc[0 ]; 2055 a = colsrc[0 ];
2056 b = colsrc[stride ]; 2056 b = colsrc[stride ];
2057 c = colsrc[stride*2]; 2057 c = colsrc[stride*2];
2058 d = (a-b)>>31; 2058 d = (a-b)>>31;
2059 e = (b-c)>>31; 2059 e = (b-c)>>31;
2060 f = (c-a)>>31; 2060 f = (c-a)>>31;
2061 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); 2061 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2062 colsrc += stride*2; 2062 colsrc += stride*2;
2063 } 2063 }
2064 src++; 2064 src++;
2065 } 2065 }
2066 #endif //HAVE_MMX 2066 #endif //HAVE_MMX
2067 } 2067 }
2068 2068
2069 #ifdef HAVE_MMX 2069 #ifdef HAVE_MMX
2070 /** 2070 /**
2071 * transposes and shift the given 8x8 Block into dst1 and dst2 2071 * transposes and shift the given 8x8 Block into dst1 and dst2
2072 */ 2072 */
// Reads eight 8-byte rows from src (row pitch srcStride), transposes them in
// two 4-row passes using MMX byte/word unpacks, and stores the resulting
// columns as 4-byte pieces at a 16-byte pitch:
//   columns 0..4 go to dst1 at offsets 128, 144, 160, 176, 192
//   columns 3..7 go to dst2 at offsets  48,  64,  80,  96, 112
// (columns 3 and 4 are written to both buffers).
// The first pass (source rows 0..3) fills bytes 0..3 of each output line; the
// second pass (source rows 4..7) uses the same offsets plus 4.
// NOTE(review): the asm stores through %2/%3, but the clobber list names only
// REG_a and no "memory" clobber is declared - confirm this is intended.
2073 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 2073 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2074 { 2074 {
2075 asm( 2075 asm(
// REG_a = src + srcStride, i.e. the address of source row 1.
2076 "lea (%0, %1), %%"REG_a" \n\t" 2076 "lea (%0, %1), %%"REG_a" \n\t"
2077 // 0 1 2 3 4 5 6 7 8 9 2077 // 0 1 2 3 4 5 6 7 8 9
2078 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2078 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// Pass 1: interleave the bytes of rows 0 and 1 (low half in mm0, high half in mm2).
2079 "movq (%0), %%mm0 \n\t" // 12345678 2079 "movq (%0), %%mm0 \n\t" // 12345678
2080 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 2080 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
2081 "movq %%mm0, %%mm2 \n\t" // 12345678 2081 "movq %%mm0, %%mm2 \n\t" // 12345678
2082 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2082 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2083 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2083 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2084 2084
// Same byte interleave for rows 2 and 3 (low half in mm1, high half in mm4).
2085 "movq (%%"REG_a", %1), %%mm1 \n\t" 2085 "movq (%%"REG_a", %1), %%mm1 \n\t"
2086 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 2086 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2087 "movq %%mm1, %%mm4 \n\t" 2087 "movq %%mm1, %%mm4 \n\t"
2088 "punpcklbw %%mm3, %%mm1 \n\t" 2088 "punpcklbw %%mm3, %%mm1 \n\t"
2089 "punpckhbw %%mm3, %%mm4 \n\t" 2089 "punpckhbw %%mm3, %%mm4 \n\t"
2090 2090
// Word interleave: each 32-bit half of mm0, mm3, mm2, mm1 now holds one source
// column of rows 0..3 (mm0: columns 0,1; mm3: 2,3; mm2: 4,5; mm1: 6,7).
2091 "movq %%mm0, %%mm3 \n\t" 2091 "movq %%mm0, %%mm3 \n\t"
2092 "punpcklwd %%mm1, %%mm0 \n\t" 2092 "punpcklwd %%mm1, %%mm0 \n\t"
2093 "punpckhwd %%mm1, %%mm3 \n\t" 2093 "punpckhwd %%mm1, %%mm3 \n\t"
2094 "movq %%mm2, %%mm1 \n\t" 2094 "movq %%mm2, %%mm1 \n\t"
2095 "punpcklwd %%mm4, %%mm2 \n\t" 2095 "punpcklwd %%mm4, %%mm2 \n\t"
2096 "punpckhwd %%mm4, %%mm1 \n\t" 2096 "punpckhwd %%mm4, %%mm1 \n\t"
2097 2097
// Store the columns; psrlq $32 moves the upper column of a register into the
// low 32 bits so movd can write it. Columns 3 and 4 are stored to both buffers.
2098 "movd %%mm0, 128(%2) \n\t" 2098 "movd %%mm0, 128(%2) \n\t"
2099 "psrlq $32, %%mm0 \n\t" 2099 "psrlq $32, %%mm0 \n\t"
2100 "movd %%mm0, 144(%2) \n\t" 2100 "movd %%mm0, 144(%2) \n\t"
2101 "movd %%mm3, 160(%2) \n\t" 2101 "movd %%mm3, 160(%2) \n\t"
2102 "psrlq $32, %%mm3 \n\t" 2102 "psrlq $32, %%mm3 \n\t"
2103 "movd %%mm3, 176(%2) \n\t" 2103 "movd %%mm3, 176(%2) \n\t"
2104 "movd %%mm3, 48(%3) \n\t" 2104 "movd %%mm3, 48(%3) \n\t"
2105 "movd %%mm2, 192(%2) \n\t" 2105 "movd %%mm2, 192(%2) \n\t"
2106 "movd %%mm2, 64(%3) \n\t" 2106 "movd %%mm2, 64(%3) \n\t"
2107 "psrlq $32, %%mm2 \n\t" 2107 "psrlq $32, %%mm2 \n\t"
2108 "movd %%mm2, 80(%3) \n\t" 2108 "movd %%mm2, 80(%3) \n\t"
2109 "movd %%mm1, 96(%3) \n\t" 2109 "movd %%mm1, 96(%3) \n\t"
2110 "psrlq $32, %%mm1 \n\t" 2110 "psrlq $32, %%mm1 \n\t"
2111 "movd %%mm1, 112(%3) \n\t" 2111 "movd %%mm1, 112(%3) \n\t"
2112 2112
// Advance REG_a by 4*srcStride so it addresses source row 5.
2113 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" 2113 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"
2114 2114
// Pass 2: identical unpack sequence applied to source rows 4..7.
2115 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 2115 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2116 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 2116 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
2117 "movq %%mm0, %%mm2 \n\t" // 12345678 2117 "movq %%mm0, %%mm2 \n\t" // 12345678
2118 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2118 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2119 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2119 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2120 2120
2121 "movq (%%"REG_a", %1), %%mm1 \n\t" 2121 "movq (%%"REG_a", %1), %%mm1 \n\t"
2122 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 2122 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2123 "movq %%mm1, %%mm4 \n\t" 2123 "movq %%mm1, %%mm4 \n\t"
2124 "punpcklbw %%mm3, %%mm1 \n\t" 2124 "punpcklbw %%mm3, %%mm1 \n\t"
2125 "punpckhbw %%mm3, %%mm4 \n\t" 2125 "punpckhbw %%mm3, %%mm4 \n\t"
2126 2126
2127 "movq %%mm0, %%mm3 \n\t" 2127 "movq %%mm0, %%mm3 \n\t"
2128 "punpcklwd %%mm1, %%mm0 \n\t" 2128 "punpcklwd %%mm1, %%mm0 \n\t"
2129 "punpckhwd %%mm1, %%mm3 \n\t" 2129 "punpckhwd %%mm1, %%mm3 \n\t"
2130 "movq %%mm2, %%mm1 \n\t" 2130 "movq %%mm2, %%mm1 \n\t"
2131 "punpcklwd %%mm4, %%mm2 \n\t" 2131 "punpcklwd %%mm4, %%mm2 \n\t"
2132 "punpckhwd %%mm4, %%mm1 \n\t" 2132 "punpckhwd %%mm4, %%mm1 \n\t"
2133 2133
// Same store pattern as pass 1, shifted by 4 bytes within each 16-byte line.
2134 "movd %%mm0, 132(%2) \n\t" 2134 "movd %%mm0, 132(%2) \n\t"
2135 "psrlq $32, %%mm0 \n\t" 2135 "psrlq $32, %%mm0 \n\t"
2136 "movd %%mm0, 148(%2) \n\t" 2136 "movd %%mm0, 148(%2) \n\t"
2137 "movd %%mm3, 164(%2) \n\t" 2137 "movd %%mm3, 164(%2) \n\t"
2138 "psrlq $32, %%mm3 \n\t" 2138 "psrlq $32, %%mm3 \n\t"
2139 "movd %%mm3, 180(%2) \n\t" 2139 "movd %%mm3, 180(%2) \n\t"
2140 "movd %%mm3, 52(%3) \n\t" 2140 "movd %%mm3, 52(%3) \n\t"
2141 "movd %%mm2, 196(%2) \n\t" 2141 "movd %%mm2, 196(%2) \n\t"
2142 "movd %%mm2, 68(%3) \n\t" 2142 "movd %%mm2, 68(%3) \n\t"
2143 "psrlq $32, %%mm2 \n\t" 2143 "psrlq $32, %%mm2 \n\t"
2144 "movd %%mm2, 84(%3) \n\t" 2144 "movd %%mm2, 84(%3) \n\t"
2145 "movd %%mm1, 100(%3) \n\t" 2145 "movd %%mm1, 100(%3) \n\t"
2146 "psrlq $32, %%mm1 \n\t" 2146 "psrlq $32, %%mm1 \n\t"
2147 "movd %%mm1, 116(%3) \n\t" 2147 "movd %%mm1, 116(%3) \n\t"
2148 2148
2149 2149
// Operands: %0 = src, %1 = srcStride widened to long, %2 = dst1, %3 = dst2.
2150 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2) 2150 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
2151 : "%"REG_a 2151 : "%"REG_a
2152 ); 2152 );
2153 } 2153 }
2154 2154
2155 /** 2155 /**
2156 * transposes the given 8x8 block 2156 * transposes the given 8x8 block
2157 */ 2157 */
// Reads eight 8-byte rows from src at a fixed 16-byte pitch (offsets 0, 16, ...,
// 112), transposes them in two 4-row passes using MMX byte/word unpacks, and
// writes the result to dst as eight rows at pitch dstStride.
// Source rows 0..3 produce bytes 0..3 of every dst row; source rows 4..7
// produce bytes 4..7 (the second group of stores uses a +4 displacement).
// REG_a and REG_d serve as scratch address registers and are listed as clobbers.
// NOTE(review): the asm stores through %0, but no "memory" clobber is
// declared - confirm this is intended.
2158 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) 2158 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2159 { 2159 {
2160 asm( 2160 asm(
// REG_a = dst + dstStride (dst row 1); REG_d = REG_a + 4*dstStride (dst row 5).
2161 "lea (%0, %1), %%"REG_a" \n\t" 2161 "lea (%0, %1), %%"REG_a" \n\t"
2162 "lea (%%"REG_a",%1,4), %%"REG_d"\n\t" 2162 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
2163 // 0 1 2 3 4 5 6 7 8 9 2163 // 0 1 2 3 4 5 6 7 8 9
2164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
// Pass 1: interleave the bytes of the source rows at offsets 0 and 16.
2165 "movq (%2), %%mm0 \n\t" // 12345678 2165 "movq (%2), %%mm0 \n\t" // 12345678
2166 "movq 16(%2), %%mm1 \n\t" // abcdefgh 2166 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2167 "movq %%mm0, %%mm2 \n\t" // 12345678 2167 "movq %%mm0, %%mm2 \n\t" // 12345678
2168 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2168 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2169 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2169 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2170 2170
// Same byte interleave for the source rows at offsets 32 and 48.
2171 "movq 32(%2), %%mm1 \n\t" 2171 "movq 32(%2), %%mm1 \n\t"
2172 "movq 48(%2), %%mm3 \n\t" 2172 "movq 48(%2), %%mm3 \n\t"
2173 "movq %%mm1, %%mm4 \n\t" 2173 "movq %%mm1, %%mm4 \n\t"
2174 "punpcklbw %%mm3, %%mm1 \n\t" 2174 "punpcklbw %%mm3, %%mm1 \n\t"
2175 "punpckhbw %%mm3, %%mm4 \n\t" 2175 "punpckhbw %%mm3, %%mm4 \n\t"
2176 2176
// Word interleave: each 32-bit half of mm0, mm3, mm2, mm1 now holds one source
// column of the four rows (mm0: columns 0,1; mm3: 2,3; mm2: 4,5; mm1: 6,7).
2177 "movq %%mm0, %%mm3 \n\t" 2177 "movq %%mm0, %%mm3 \n\t"
2178 "punpcklwd %%mm1, %%mm0 \n\t" 2178 "punpcklwd %%mm1, %%mm0 \n\t"
2179 "punpckhwd %%mm1, %%mm3 \n\t" 2179 "punpckhwd %%mm1, %%mm3 \n\t"
2180 "movq %%mm2, %%mm1 \n\t" 2180 "movq %%mm2, %%mm1 \n\t"
2181 "punpcklwd %%mm4, %%mm2 \n\t" 2181 "punpcklwd %%mm4, %%mm2 \n\t"
2182 "punpckhwd %%mm4, %%mm1 \n\t" 2182 "punpckhwd %%mm4, %%mm1 \n\t"
2183 2183
// Write source columns 0..7 to the first 4 bytes of dst rows 0..7; psrlq $32
// moves the upper column of a register into the low 32 bits for movd.
2184 "movd %%mm0, (%0) \n\t" 2184 "movd %%mm0, (%0) \n\t"
2185 "psrlq $32, %%mm0 \n\t" 2185 "psrlq $32, %%mm0 \n\t"
2186 "movd %%mm0, (%%"REG_a") \n\t" 2186 "movd %%mm0, (%%"REG_a") \n\t"
2187 "movd %%mm3, (%%"REG_a", %1) \n\t" 2187 "movd %%mm3, (%%"REG_a", %1) \n\t"
2188 "psrlq $32, %%mm3 \n\t" 2188 "psrlq $32, %%mm3 \n\t"
2189 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" 2189 "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
2190 "movd %%mm2, (%0, %1, 4) \n\t" 2190 "movd %%mm2, (%0, %1, 4) \n\t"
2191 "psrlq $32, %%mm2 \n\t" 2191 "psrlq $32, %%mm2 \n\t"
2192 "movd %%mm2, (%%"REG_d") \n\t" 2192 "movd %%mm2, (%%"REG_d") \n\t"
2193 "movd %%mm1, (%%"REG_d", %1) \n\t" 2193 "movd %%mm1, (%%"REG_d", %1) \n\t"
2194 "psrlq $32, %%mm1 \n\t" 2194 "psrlq $32, %%mm1 \n\t"
2195 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" 2195 "movd %%mm1, (%%"REG_d", %1, 2) \n\t"
2196 2196
2197 2197
// Pass 2: identical unpack sequence for the source rows at offsets 64..112.
2198 "movq 64(%2), %%mm0 \n\t" // 12345678 2198 "movq 64(%2), %%mm0 \n\t" // 12345678
2199 "movq 80(%2), %%mm1 \n\t" // abcdefgh 2199 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2200 "movq %%mm0, %%mm2 \n\t" // 12345678 2200 "movq %%mm0, %%mm2 \n\t" // 12345678
2201 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2201 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2202 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2202 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2203 2203
2204 "movq 96(%2), %%mm1 \n\t" 2204 "movq 96(%2), %%mm1 \n\t"
2205 "movq 112(%2), %%mm3 \n\t" 2205 "movq 112(%2), %%mm3 \n\t"
2206 "movq %%mm1, %%mm4 \n\t" 2206 "movq %%mm1, %%mm4 \n\t"
2207 "punpcklbw %%mm3, %%mm1 \n\t" 2207 "punpcklbw %%mm3, %%mm1 \n\t"
2208 "punpckhbw %%mm3, %%mm4 \n\t" 2208 "punpckhbw %%mm3, %%mm4 \n\t"
2209 2209
2210 "movq %%mm0, %%mm3 \n\t" 2210 "movq %%mm0, %%mm3 \n\t"
2211 "punpcklwd %%mm1, %%mm0 \n\t" 2211 "punpcklwd %%mm1, %%mm0 \n\t"
2212 "punpckhwd %%mm1, %%mm3 \n\t" 2212 "punpckhwd %%mm1, %%mm3 \n\t"
2213 "movq %%mm2, %%mm1 \n\t" 2213 "movq %%mm2, %%mm1 \n\t"
2214 "punpcklwd %%mm4, %%mm2 \n\t" 2214 "punpcklwd %%mm4, %%mm2 \n\t"
2215 "punpckhwd %%mm4, %%mm1 \n\t" 2215 "punpckhwd %%mm4, %%mm1 \n\t"
2216 2216
// Same dst rows as pass 1, written at byte offset 4 of each row.
2217 "movd %%mm0, 4(%0) \n\t" 2217 "movd %%mm0, 4(%0) \n\t"
2218 "psrlq $32, %%mm0 \n\t" 2218 "psrlq $32, %%mm0 \n\t"
2219 "movd %%mm0, 4(%%"REG_a") \n\t" 2219 "movd %%mm0, 4(%%"REG_a") \n\t"
2220 "movd %%mm3, 4(%%"REG_a", %1) \n\t" 2220 "movd %%mm3, 4(%%"REG_a", %1) \n\t"
2221 "psrlq $32, %%mm3 \n\t" 2221 "psrlq $32, %%mm3 \n\t"
2222 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" 2222 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
2223 "movd %%mm2, 4(%0, %1, 4) \n\t" 2223 "movd %%mm2, 4(%0, %1, 4) \n\t"
2224 "psrlq $32, %%mm2 \n\t" 2224 "psrlq $32, %%mm2 \n\t"
2225 "movd %%mm2, 4(%%"REG_d") \n\t" 2225 "movd %%mm2, 4(%%"REG_d") \n\t"
2226 "movd %%mm1, 4(%%"REG_d", %1) \n\t" 2226 "movd %%mm1, 4(%%"REG_d", %1) \n\t"
2227 "psrlq $32, %%mm1 \n\t" 2227 "psrlq $32, %%mm1 \n\t"
2228 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" 2228 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"
2229 2229
// Operands: %0 = dst, %1 = dstStride widened to long, %2 = src.
2230 :: "r" (dst), "r" ((long)dstStride), "r" (src) 2230 :: "r" (dst), "r" ((long)dstStride), "r" (src)
2231 : "%"REG_a, "%"REG_d 2231 : "%"REG_a, "%"REG_d
2232 ); 2232 );
2233 } 2233 }
2234 #endif //HAVE_MMX 2234 #endif //HAVE_MMX
2235 //static long test=0; 2235 //static long test=0;
2236 2236
2237 #ifndef HAVE_ALTIVEC 2237 #ifndef HAVE_ALTIVEC
2238 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 2238 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2239 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) 2239 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2240 { 2240 {
2241 // to save a register (FIXME do this outside of the loops) 2241 // to save a register (FIXME do this outside of the loops)
2242 tempBluredPast[127]= maxNoise[0]; 2242 tempBluredPast[127]= maxNoise[0];
2243 tempBluredPast[128]= maxNoise[1]; 2243 tempBluredPast[128]= maxNoise[1];
2244 tempBluredPast[129]= maxNoise[2]; 2244 tempBluredPast[129]= maxNoise[2];
2245 2245
2246 #define FAST_L2_DIFF 2246 #define FAST_L2_DIFF
2247 //#define L1_DIFF //u should change the thresholds too if u try that one 2247 //#define L1_DIFF //u should change the thresholds too if u try that one
2248 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2248 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2249 asm volatile( 2249 asm volatile(
2250 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride 2250 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2251 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride 2251 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2252 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2252 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2253 // 0 1 2 3 4 5 6 7 8 9 2253 // 0 1 2 3 4 5 6 7 8 9
2254 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 2254 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2255 //FIXME reorder? 2255 //FIXME reorder?
2256 #ifdef L1_DIFF //needs mmx2 2256 #ifdef L1_DIFF //needs mmx2
2257 "movq (%0), %%mm0 \n\t" // L0 2257 "movq (%0), %%mm0 \n\t" // L0
2258 "psadbw (%1), %%mm0 \n\t" // |L0-R0| 2258 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2259 "movq (%0, %2), %%mm1 \n\t" // L1 2259 "movq (%0, %2), %%mm1 \n\t" // L1
2260 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| 2260 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2261 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2261 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2262 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| 2262 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2263 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2263 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2264 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| 2264 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|
2265 2265
2266 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2266 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2267 "paddw %%mm1, %%mm0 \n\t" 2267 "paddw %%mm1, %%mm0 \n\t"
2268 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| 2268 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2269 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2269 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2270 "paddw %%mm2, %%mm0 \n\t" 2270 "paddw %%mm2, %%mm0 \n\t"
2271 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| 2271 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
2272 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2272 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2273 "paddw %%mm3, %%mm0 \n\t" 2273 "paddw %%mm3, %%mm0 \n\t"
2274 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| 2274 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
2275 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2275 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2276 "paddw %%mm4, %%mm0 \n\t" 2276 "paddw %%mm4, %%mm0 \n\t"
2277 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| 2277 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
2278 "paddw %%mm5, %%mm6 \n\t" 2278 "paddw %%mm5, %%mm6 \n\t"
2279 "paddw %%mm7, %%mm6 \n\t" 2279 "paddw %%mm7, %%mm6 \n\t"
2280 "paddw %%mm6, %%mm0 \n\t" 2280 "paddw %%mm6, %%mm0 \n\t"
2281 #else //L1_DIFF 2281 #else //L1_DIFF
2282 #if defined (FAST_L2_DIFF) 2282 #if defined (FAST_L2_DIFF)
2283 "pcmpeqb %%mm7, %%mm7 \n\t" 2283 "pcmpeqb %%mm7, %%mm7 \n\t"
2284 "movq "MANGLE(b80)", %%mm6 \n\t" 2284 "movq "MANGLE(b80)", %%mm6 \n\t"
2285 "pxor %%mm0, %%mm0 \n\t" 2285 "pxor %%mm0, %%mm0 \n\t"
2286 #define REAL_L2_DIFF_CORE(a, b)\ 2286 #define REAL_L2_DIFF_CORE(a, b)\
2287 "movq " #a ", %%mm5 \n\t"\ 2287 "movq " #a ", %%mm5 \n\t"\
2288 "movq " #b ", %%mm2 \n\t"\ 2288 "movq " #b ", %%mm2 \n\t"\
2289 "pxor %%mm7, %%mm2 \n\t"\ 2289 "pxor %%mm7, %%mm2 \n\t"\
2290 PAVGB(%%mm2, %%mm5)\ 2290 PAVGB(%%mm2, %%mm5)\
2291 "paddb %%mm6, %%mm5 \n\t"\ 2291 "paddb %%mm6, %%mm5 \n\t"\
2292 "movq %%mm5, %%mm2 \n\t"\ 2292 "movq %%mm5, %%mm2 \n\t"\
2293 "psllw $8, %%mm5 \n\t"\ 2293 "psllw $8, %%mm5 \n\t"\
2294 "pmaddwd %%mm5, %%mm5 \n\t"\ 2294 "pmaddwd %%mm5, %%mm5 \n\t"\
2295 "pmaddwd %%mm2, %%mm2 \n\t"\ 2295 "pmaddwd %%mm2, %%mm2 \n\t"\
2296 "paddd %%mm2, %%mm5 \n\t"\ 2296 "paddd %%mm2, %%mm5 \n\t"\
2297 "psrld $14, %%mm5 \n\t"\ 2297 "psrld $14, %%mm5 \n\t"\
2298 "paddd %%mm5, %%mm0 \n\t" 2298 "paddd %%mm5, %%mm0 \n\t"
2299 2299
2300 #else //defined (FAST_L2_DIFF) 2300 #else //defined (FAST_L2_DIFF)
2301 "pxor %%mm7, %%mm7 \n\t" 2301 "pxor %%mm7, %%mm7 \n\t"
2302 "pxor %%mm0, %%mm0 \n\t" 2302 "pxor %%mm0, %%mm0 \n\t"
2303 #define REAL_L2_DIFF_CORE(a, b)\ 2303 #define REAL_L2_DIFF_CORE(a, b)\
2304 "movq " #a ", %%mm5 \n\t"\ 2304 "movq " #a ", %%mm5 \n\t"\
2305 "movq " #b ", %%mm2 \n\t"\ 2305 "movq " #b ", %%mm2 \n\t"\
2306 "movq %%mm5, %%mm1 \n\t"\ 2306 "movq %%mm5, %%mm1 \n\t"\
2307 "movq %%mm2, %%mm3 \n\t"\ 2307 "movq %%mm2, %%mm3 \n\t"\
2308 "punpcklbw %%mm7, %%mm5 \n\t"\ 2308 "punpcklbw %%mm7, %%mm5 \n\t"\
2309 "punpckhbw %%mm7, %%mm1 \n\t"\ 2309 "punpckhbw %%mm7, %%mm1 \n\t"\
2310 "punpcklbw %%mm7, %%mm2 \n\t"\ 2310 "punpcklbw %%mm7, %%mm2 \n\t"\
2311 "punpckhbw %%mm7, %%mm3 \n\t"\ 2311 "punpckhbw %%mm7, %%mm3 \n\t"\
2312 "psubw %%mm2, %%mm5 \n\t"\ 2312 "psubw %%mm2, %%mm5 \n\t"\
2313 "psubw %%mm3, %%mm1 \n\t"\ 2313 "psubw %%mm3, %%mm1 \n\t"\
2314 "pmaddwd %%mm5, %%mm5 \n\t"\ 2314 "pmaddwd %%mm5, %%mm5 \n\t"\
2315 "pmaddwd %%mm1, %%mm1 \n\t"\ 2315 "pmaddwd %%mm1, %%mm1 \n\t"\
2316 "paddd %%mm1, %%mm5 \n\t"\ 2316 "paddd %%mm1, %%mm5 \n\t"\
2317 "paddd %%mm5, %%mm0 \n\t" 2317 "paddd %%mm5, %%mm0 \n\t"
2318 2318
2319 #endif //defined (FAST_L2_DIFF) 2319 #endif //defined (FAST_L2_DIFF)
2320 2320
2321 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) 2321 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2322 2322
2323 L2_DIFF_CORE((%0), (%1)) 2323 L2_DIFF_CORE((%0) , (%1))
2324 L2_DIFF_CORE((%0, %2), (%1, %2)) 2324 L2_DIFF_CORE((%0, %2) , (%1, %2))
2325 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) 2325 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2326 L2_DIFF_CORE((%0, %%REGa), (%1, %%REGa)) 2326 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
2327 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) 2327 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2328 L2_DIFF_CORE((%0, %%REGd), (%1, %%REGd)) 2328 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
2329 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) 2329 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2330 L2_DIFF_CORE((%0, %%REGc), (%1, %%REGc)) 2330 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
2331 2331
2332 #endif //L1_DIFF 2332 #endif //L1_DIFF
2333 2333
2334 "movq %%mm0, %%mm4 \n\t" 2334 "movq %%mm0, %%mm4 \n\t"
2335 "psrlq $32, %%mm0 \n\t" 2335 "psrlq $32, %%mm0 \n\t"
2336 "paddd %%mm0, %%mm4 \n\t" 2336 "paddd %%mm0, %%mm4 \n\t"
2337 "movd %%mm4, %%ecx \n\t" 2337 "movd %%mm4, %%ecx \n\t"
2338 "shll $2, %%ecx \n\t" 2338 "shll $2, %%ecx \n\t"
2339 "mov %3, %%"REG_d" \n\t" 2339 "mov %3, %%"REG_d" \n\t"
2340 "addl -4(%%"REG_d"), %%ecx \n\t" 2340 "addl -4(%%"REG_d"), %%ecx \n\t"
2341 "addl 4(%%"REG_d"), %%ecx \n\t" 2341 "addl 4(%%"REG_d"), %%ecx \n\t"
2342 "addl -1024(%%"REG_d"), %%ecx \n\t" 2342 "addl -1024(%%"REG_d"), %%ecx \n\t"
2343 "addl $4, %%ecx \n\t" 2343 "addl $4, %%ecx \n\t"
2344 "addl 1024(%%"REG_d"), %%ecx \n\t" 2344 "addl 1024(%%"REG_d"), %%ecx \n\t"
2345 "shrl $3, %%ecx \n\t" 2345 "shrl $3, %%ecx \n\t"
2346 "movl %%ecx, (%%"REG_d") \n\t" 2346 "movl %%ecx, (%%"REG_d") \n\t"
2347 2347
2348 // "mov %3, %%"REG_c" \n\t" 2348 // "mov %3, %%"REG_c" \n\t"
2349 // "mov %%"REG_c", test \n\t" 2349 // "mov %%"REG_c", test \n\t"
2350 // "jmp 4f \n\t" 2350 // "jmp 4f \n\t"
2351 "cmpl 512(%%"REG_d"), %%ecx \n\t" 2351 "cmpl 512(%%"REG_d"), %%ecx \n\t"
2352 " jb 2f \n\t" 2352 " jb 2f \n\t"
2353 "cmpl 516(%%"REG_d"), %%ecx \n\t" 2353 "cmpl 516(%%"REG_d"), %%ecx \n\t"
2354 " jb 1f \n\t" 2354 " jb 1f \n\t"
2355 2355
2356 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2356 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2357 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2357 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2358 "movq (%0), %%mm0 \n\t" // L0 2358 "movq (%0), %%mm0 \n\t" // L0
2359 "movq (%0, %2), %%mm1 \n\t" // L1 2359 "movq (%0, %2), %%mm1 \n\t" // L1
2360 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2360 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2361 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2361 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2362 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2362 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2363 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2363 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2364 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2364 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2365 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2365 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2366 "movq %%mm0, (%1) \n\t" // L0 2366 "movq %%mm0, (%1) \n\t" // L0
2367 "movq %%mm1, (%1, %2) \n\t" // L1 2367 "movq %%mm1, (%1, %2) \n\t" // L1
2368 "movq %%mm2, (%1, %2, 2) \n\t" // L2 2368 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2369 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 2369 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3
2370 "movq %%mm4, (%1, %2, 4) \n\t" // L4 2370 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2371 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 2371 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5
2372 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 2372 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
2373 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 2373 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7
2374 "jmp 4f \n\t" 2374 "jmp 4f \n\t"
2375 2375
2376 "1: \n\t" 2376 "1: \n\t"
2377 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2377 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2378 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2378 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2379 "movq (%0), %%mm0 \n\t" // L0 2379 "movq (%0), %%mm0 \n\t" // L0
2380 PAVGB((%1), %%mm0) // L0 2380 PAVGB((%1), %%mm0) // L0
2381 "movq (%0, %2), %%mm1 \n\t" // L1 2381 "movq (%0, %2), %%mm1 \n\t" // L1
2382 PAVGB((%1, %2), %%mm1) // L1 2382 PAVGB((%1, %2), %%mm1) // L1
2383 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2383 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2384 PAVGB((%1, %2, 2), %%mm2) // L2 2384 PAVGB((%1, %2, 2), %%mm2) // L2
2385 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2385 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2386 PAVGB((%1, %%REGa), %%mm3) // L3 2386 PAVGB((%1, %%REGa), %%mm3) // L3
2387 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2387 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2388 PAVGB((%1, %2, 4), %%mm4) // L4 2388 PAVGB((%1, %2, 4), %%mm4) // L4
2389 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2389 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2390 PAVGB((%1, %%REGd), %%mm5) // L5 2390 PAVGB((%1, %%REGd), %%mm5) // L5
2391 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2391 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2392 PAVGB((%1, %%REGa, 2), %%mm6) // L6 2392 PAVGB((%1, %%REGa, 2), %%mm6) // L6
2393 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2393 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2394 PAVGB((%1, %%REGc), %%mm7) // L7 2394 PAVGB((%1, %%REGc), %%mm7) // L7
2395 "movq %%mm0, (%1) \n\t" // R0 2395 "movq %%mm0, (%1) \n\t" // R0
2396 "movq %%mm1, (%1, %2) \n\t" // R1 2396 "movq %%mm1, (%1, %2) \n\t" // R1
2397 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2397 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2398 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2398 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2399 "movq %%mm4, (%1, %2, 4) \n\t" // R4 2399 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2400 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 2400 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5
2401 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 2401 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
2402 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 2402 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7
2403 "movq %%mm0, (%0) \n\t" // L0 2403 "movq %%mm0, (%0) \n\t" // L0
2404 "movq %%mm1, (%0, %2) \n\t" // L1 2404 "movq %%mm1, (%0, %2) \n\t" // L1
2405 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2405 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2406 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2406 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2407 "movq %%mm4, (%0, %2, 4) \n\t" // L4 2407 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2408 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 2408 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5
2409 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 2409 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
2410 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 2410 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7
2411 "jmp 4f \n\t" 2411 "jmp 4f \n\t"
2412 2412
2413 "2: \n\t" 2413 "2: \n\t"
2414 "cmpl 508(%%"REG_d"), %%ecx \n\t" 2414 "cmpl 508(%%"REG_d"), %%ecx \n\t"
2415 " jb 3f \n\t" 2415 " jb 3f \n\t"
2416 2416
2417 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2417 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2418 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2418 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2419 "movq (%0), %%mm0 \n\t" // L0 2419 "movq (%0), %%mm0 \n\t" // L0
2420 "movq (%0, %2), %%mm1 \n\t" // L1 2420 "movq (%0, %2), %%mm1 \n\t" // L1
2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2422 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2422 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2423 "movq (%1), %%mm4 \n\t" // R0 2423 "movq (%1), %%mm4 \n\t" // R0
2424 "movq (%1, %2), %%mm5 \n\t" // R1 2424 "movq (%1, %2), %%mm5 \n\t" // R1
2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2 2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2426 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2426 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2427 PAVGB(%%mm4, %%mm0) 2427 PAVGB(%%mm4, %%mm0)
2428 PAVGB(%%mm5, %%mm1) 2428 PAVGB(%%mm5, %%mm1)
2429 PAVGB(%%mm6, %%mm2) 2429 PAVGB(%%mm6, %%mm2)
2430 PAVGB(%%mm7, %%mm3) 2430 PAVGB(%%mm7, %%mm3)
2431 PAVGB(%%mm4, %%mm0) 2431 PAVGB(%%mm4, %%mm0)
2432 PAVGB(%%mm5, %%mm1) 2432 PAVGB(%%mm5, %%mm1)
2433 PAVGB(%%mm6, %%mm2) 2433 PAVGB(%%mm6, %%mm2)
2434 PAVGB(%%mm7, %%mm3) 2434 PAVGB(%%mm7, %%mm3)
2435 "movq %%mm0, (%1) \n\t" // R0 2435 "movq %%mm0, (%1) \n\t" // R0
2436 "movq %%mm1, (%1, %2) \n\t" // R1 2436 "movq %%mm1, (%1, %2) \n\t" // R1
2437 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2437 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2438 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2438 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2439 "movq %%mm0, (%0) \n\t" // L0 2439 "movq %%mm0, (%0) \n\t" // L0
2440 "movq %%mm1, (%0, %2) \n\t" // L1 2440 "movq %%mm1, (%0, %2) \n\t" // L1
2441 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2441 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2442 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2442 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2443 2443
2444 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2444 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2445 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2445 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2446 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2446 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2447 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2447 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2448 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2448 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2449 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2449 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2450 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2450 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2451 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2451 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2452 PAVGB(%%mm4, %%mm0) 2452 PAVGB(%%mm4, %%mm0)
2453 PAVGB(%%mm5, %%mm1) 2453 PAVGB(%%mm5, %%mm1)
2454 PAVGB(%%mm6, %%mm2) 2454 PAVGB(%%mm6, %%mm2)
2455 PAVGB(%%mm7, %%mm3) 2455 PAVGB(%%mm7, %%mm3)
2456 PAVGB(%%mm4, %%mm0) 2456 PAVGB(%%mm4, %%mm0)
2457 PAVGB(%%mm5, %%mm1) 2457 PAVGB(%%mm5, %%mm1)
2458 PAVGB(%%mm6, %%mm2) 2458 PAVGB(%%mm6, %%mm2)
2459 PAVGB(%%mm7, %%mm3) 2459 PAVGB(%%mm7, %%mm3)
2460 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2460 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2461 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2461 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2462 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2462 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2463 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2463 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2464 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2464 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2465 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2465 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2466 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2466 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2467 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2467 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2468 "jmp 4f \n\t" 2468 "jmp 4f \n\t"
2469 2469
2470 "3: \n\t" 2470 "3: \n\t"
2471 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2471 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2472 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2472 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2473 "movq (%0), %%mm0 \n\t" // L0 2473 "movq (%0), %%mm0 \n\t" // L0
2474 "movq (%0, %2), %%mm1 \n\t" // L1 2474 "movq (%0, %2), %%mm1 \n\t" // L1
2475 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2475 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2476 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2476 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2477 "movq (%1), %%mm4 \n\t" // R0 2477 "movq (%1), %%mm4 \n\t" // R0
2478 "movq (%1, %2), %%mm5 \n\t" // R1 2478 "movq (%1, %2), %%mm5 \n\t" // R1
2479 "movq (%1, %2, 2), %%mm6 \n\t" // R2 2479 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2480 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2480 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2481 PAVGB(%%mm4, %%mm0) 2481 PAVGB(%%mm4, %%mm0)
2482 PAVGB(%%mm5, %%mm1) 2482 PAVGB(%%mm5, %%mm1)
2483 PAVGB(%%mm6, %%mm2) 2483 PAVGB(%%mm6, %%mm2)
2484 PAVGB(%%mm7, %%mm3) 2484 PAVGB(%%mm7, %%mm3)
2485 PAVGB(%%mm4, %%mm0) 2485 PAVGB(%%mm4, %%mm0)
2486 PAVGB(%%mm5, %%mm1) 2486 PAVGB(%%mm5, %%mm1)
2487 PAVGB(%%mm6, %%mm2) 2487 PAVGB(%%mm6, %%mm2)
2488 PAVGB(%%mm7, %%mm3) 2488 PAVGB(%%mm7, %%mm3)
2489 PAVGB(%%mm4, %%mm0) 2489 PAVGB(%%mm4, %%mm0)
2490 PAVGB(%%mm5, %%mm1) 2490 PAVGB(%%mm5, %%mm1)
2491 PAVGB(%%mm6, %%mm2) 2491 PAVGB(%%mm6, %%mm2)
2492 PAVGB(%%mm7, %%mm3) 2492 PAVGB(%%mm7, %%mm3)
2493 "movq %%mm0, (%1) \n\t" // R0 2493 "movq %%mm0, (%1) \n\t" // R0
2494 "movq %%mm1, (%1, %2) \n\t" // R1 2494 "movq %%mm1, (%1, %2) \n\t" // R1
2495 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2495 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2496 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2496 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2497 "movq %%mm0, (%0) \n\t" // L0 2497 "movq %%mm0, (%0) \n\t" // L0
2498 "movq %%mm1, (%0, %2) \n\t" // L1 2498 "movq %%mm1, (%0, %2) \n\t" // L1
2499 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2499 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2500 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2500 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2501 2501
2502 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2502 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2503 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2503 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2504 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2504 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2505 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2505 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2506 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2506 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2507 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2507 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2508 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2508 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2509 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2509 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2510 PAVGB(%%mm4, %%mm0) 2510 PAVGB(%%mm4, %%mm0)
2511 PAVGB(%%mm5, %%mm1) 2511 PAVGB(%%mm5, %%mm1)
2512 PAVGB(%%mm6, %%mm2) 2512 PAVGB(%%mm6, %%mm2)
2513 PAVGB(%%mm7, %%mm3) 2513 PAVGB(%%mm7, %%mm3)
2514 PAVGB(%%mm4, %%mm0) 2514 PAVGB(%%mm4, %%mm0)
2515 PAVGB(%%mm5, %%mm1) 2515 PAVGB(%%mm5, %%mm1)
2516 PAVGB(%%mm6, %%mm2) 2516 PAVGB(%%mm6, %%mm2)
2517 PAVGB(%%mm7, %%mm3) 2517 PAVGB(%%mm7, %%mm3)
2518 PAVGB(%%mm4, %%mm0) 2518 PAVGB(%%mm4, %%mm0)
2519 PAVGB(%%mm5, %%mm1) 2519 PAVGB(%%mm5, %%mm1)
2520 PAVGB(%%mm6, %%mm2) 2520 PAVGB(%%mm6, %%mm2)
2521 PAVGB(%%mm7, %%mm3) 2521 PAVGB(%%mm7, %%mm3)
2522 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2522 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2523 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2523 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2524 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2524 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2525 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2525 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2526 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2526 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2527 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2527 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2528 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2528 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2529 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2529 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2530 2530
2531 "4: \n\t" 2531 "4: \n\t"
2532 2532
2533 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) 2533 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
2534 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" 2534 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2535 ); 2535 );
2536 //printf("%d\n", test); 2536 //printf("%d\n", test);
2537 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2537 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2538 { 2538 {
2539 int y; 2539 int y;
2540 int d=0; 2540 int d=0;
2541 // int sysd=0; 2541 // int sysd=0;
2542 int i; 2542 int i;
2543 2543
2544 for(y=0; y<8; y++) 2544 for(y=0; y<8; y++)
2545 { 2545 {
2546 int x; 2546 int x;
2547 for(x=0; x<8; x++) 2547 for(x=0; x<8; x++)
2548 { 2548 {
2549 int ref= tempBlured[ x + y*stride ]; 2549 int ref= tempBlured[ x + y*stride ];
2550 int cur= src[ x + y*stride ]; 2550 int cur= src[ x + y*stride ];
2551 int d1=ref - cur; 2551 int d1=ref - cur;
2552 // if(x==0 || x==7) d1+= d1>>1; 2552 // if(x==0 || x==7) d1+= d1>>1;
2553 // if(y==0 || y==7) d1+= d1>>1; 2553 // if(y==0 || y==7) d1+= d1>>1;
2554 // d+= ABS(d1); 2554 // d+= ABS(d1);
2555 d+= d1*d1; 2555 d+= d1*d1;
2556 // sysd+= d1; 2556 // sysd+= d1;
2557 } 2557 }
2558 } 2558 }
2559 i=d; 2559 i=d;
2560 d= ( 2560 d= (
2561 4*d 2561 4*d
2562 +(*(tempBluredPast-256)) 2562 +(*(tempBluredPast-256))
2563 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) 2563 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2564 +(*(tempBluredPast+256)) 2564 +(*(tempBluredPast+256))
2565 +4)>>3; 2565 +4)>>3;
2566 *tempBluredPast=i; 2566 *tempBluredPast=i;
2567 // ((*tempBluredPast)*3 + d + 2)>>2; 2567 // ((*tempBluredPast)*3 + d + 2)>>2;
2568 2568
2569 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); 2569 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2570 /* 2570 /*
2571 Switch between 2571 Switch between
2572 1 0 0 0 0 0 0 (0) 2572 1 0 0 0 0 0 0 (0)
2573 64 32 16 8 4 2 1 (1) 2573 64 32 16 8 4 2 1 (1)
2574 64 48 36 27 20 15 11 (33) (approx) 2574 64 48 36 27 20 15 11 (33) (approx)
2575 64 56 49 43 37 33 29 (200) (approx) 2575 64 56 49 43 37 33 29 (200) (approx)
2576 */ 2576 */
2577 if(d > maxNoise[1]) 2577 if(d > maxNoise[1])
2578 { 2578 {
2579 if(d < maxNoise[2]) 2579 if(d < maxNoise[2])
2580 { 2580 {
2581 for(y=0; y<8; y++) 2581 for(y=0; y<8; y++)
2582 { 2582 {
2583 int x; 2583 int x;
2584 for(x=0; x<8; x++) 2584 for(x=0; x<8; x++)
2585 { 2585 {
2586 int ref= tempBlured[ x + y*stride ]; 2586 int ref= tempBlured[ x + y*stride ];
2587 int cur= src[ x + y*stride ]; 2587 int cur= src[ x + y*stride ];
2588 tempBlured[ x + y*stride ]= 2588 tempBlured[ x + y*stride ]=
2589 src[ x + y*stride ]= 2589 src[ x + y*stride ]=
2590 (ref + cur + 1)>>1; 2590 (ref + cur + 1)>>1;
2591 } 2591 }
2592 } 2592 }
2593 } 2593 }
2594 else 2594 else
2595 { 2595 {
2596 for(y=0; y<8; y++) 2596 for(y=0; y<8; y++)
2597 { 2597 {
2598 int x; 2598 int x;
2599 for(x=0; x<8; x++) 2599 for(x=0; x<8; x++)
2600 { 2600 {
2601 tempBlured[ x + y*stride ]= src[ x + y*stride ]; 2601 tempBlured[ x + y*stride ]= src[ x + y*stride ];
2602 } 2602 }
2603 } 2603 }
2604 } 2604 }
2605 } 2605 }
2606 else 2606 else
2607 { 2607 {
2608 if(d < maxNoise[0]) 2608 if(d < maxNoise[0])
2609 { 2609 {
2610 for(y=0; y<8; y++) 2610 for(y=0; y<8; y++)
2611 { 2611 {
2612 int x; 2612 int x;
2613 for(x=0; x<8; x++) 2613 for(x=0; x<8; x++)
2614 { 2614 {
2615 int ref= tempBlured[ x + y*stride ]; 2615 int ref= tempBlured[ x + y*stride ];
2616 int cur= src[ x + y*stride ]; 2616 int cur= src[ x + y*stride ];
2617 tempBlured[ x + y*stride ]= 2617 tempBlured[ x + y*stride ]=
2618 src[ x + y*stride ]= 2618 src[ x + y*stride ]=
2619 (ref*7 + cur + 4)>>3; 2619 (ref*7 + cur + 4)>>3;
2620 } 2620 }
2621 } 2621 }
2622 } 2622 }
2623 else 2623 else
2624 { 2624 {
2625 for(y=0; y<8; y++) 2625 for(y=0; y<8; y++)
2626 { 2626 {
2627 int x; 2627 int x;
2628 for(x=0; x<8; x++) 2628 for(x=0; x<8; x++)
2629 { 2629 {
2630 int ref= tempBlured[ x + y*stride ]; 2630 int ref= tempBlured[ x + y*stride ];
2631 int cur= src[ x + y*stride ]; 2631 int cur= src[ x + y*stride ];
2632 tempBlured[ x + y*stride ]= 2632 tempBlured[ x + y*stride ]=
2633 src[ x + y*stride ]= 2633 src[ x + y*stride ]=
2634 (ref*3 + cur + 2)>>2; 2634 (ref*3 + cur + 2)>>2;
2635 } 2635 }
2636 } 2636 }
2637 } 2637 }
2638 } 2638 }
2639 } 2639 }
2640 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2640 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2641 } 2641 }
2642 #endif //HAVE_ALTIVEC 2642 #endif //HAVE_ALTIVEC
2643 2643
2644 #ifdef HAVE_MMX 2644 #ifdef HAVE_MMX
2645 /** 2645 /**
2646 * accurate deblock filter 2646 * accurate deblock filter
2647 */ 2647 */
2648 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2648 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2649 int64_t dc_mask, eq_mask, both_masks; 2649 int64_t dc_mask, eq_mask, both_masks;
2650 int64_t sums[10*8*2]; 2650 int64_t sums[10*8*2];
2651 src+= step*3; // src points to begin of the 8x8 Block 2651 src+= step*3; // src points to begin of the 8x8 Block
2652 //START_TIMER 2652 //START_TIMER
2653 asm volatile( 2653 asm volatile(
2654 "movq %0, %%mm7 \n\t" 2654 "movq %0, %%mm7 \n\t"
2655 "movq %1, %%mm6 \n\t" 2655 "movq %1, %%mm6 \n\t"
2656 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 2656 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2657 ); 2657 );
2658 2658
2659 asm volatile( 2659 asm volatile(
2660 "lea (%2, %3), %%"REG_a" \n\t" 2660 "lea (%2, %3), %%"REG_a" \n\t"
2661 // 0 1 2 3 4 5 6 7 8 9 2661 // 0 1 2 3 4 5 6 7 8 9
2662 // %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (eax' = eax+4*%3) 2662 // %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (eax' = eax+4*%3)
2663 2663
2664 "movq (%2), %%mm0 \n\t" 2664 "movq (%2), %%mm0 \n\t"
2665 "movq (%%"REG_a"), %%mm1 \n\t" 2665 "movq (%%"REG_a"), %%mm1 \n\t"
2666 "movq %%mm1, %%mm3 \n\t" 2666 "movq %%mm1, %%mm3 \n\t"
2667 "movq %%mm1, %%mm4 \n\t" 2667 "movq %%mm1, %%mm4 \n\t"
2668 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference 2668 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2669 "paddb %%mm7, %%mm0 \n\t" 2669 "paddb %%mm7, %%mm0 \n\t"
2670 "pcmpgtb %%mm6, %%mm0 \n\t" 2670 "pcmpgtb %%mm6, %%mm0 \n\t"
2671 2671
2672 "movq (%%"REG_a",%3), %%mm2 \n\t" 2672 "movq (%%"REG_a",%3), %%mm2 \n\t"
2673 PMAXUB(%%mm2, %%mm4) 2673 PMAXUB(%%mm2, %%mm4)
2674 PMINUB(%%mm2, %%mm3, %%mm5) 2674 PMINUB(%%mm2, %%mm3, %%mm5)
2675 "psubb %%mm2, %%mm1 \n\t" 2675 "psubb %%mm2, %%mm1 \n\t"
2676 "paddb %%mm7, %%mm1 \n\t" 2676 "paddb %%mm7, %%mm1 \n\t"
2677 "pcmpgtb %%mm6, %%mm1 \n\t" 2677 "pcmpgtb %%mm6, %%mm1 \n\t"
2678 "paddb %%mm1, %%mm0 \n\t" 2678 "paddb %%mm1, %%mm0 \n\t"
2679 2679
2680 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2680 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2681 PMAXUB(%%mm1, %%mm4) 2681 PMAXUB(%%mm1, %%mm4)
2682 PMINUB(%%mm1, %%mm3, %%mm5) 2682 PMINUB(%%mm1, %%mm3, %%mm5)
2683 "psubb %%mm1, %%mm2 \n\t" 2683 "psubb %%mm1, %%mm2 \n\t"
2684 "paddb %%mm7, %%mm2 \n\t" 2684 "paddb %%mm7, %%mm2 \n\t"
2685 "pcmpgtb %%mm6, %%mm2 \n\t" 2685 "pcmpgtb %%mm6, %%mm2 \n\t"
2686 "paddb %%mm2, %%mm0 \n\t" 2686 "paddb %%mm2, %%mm0 \n\t"
2687 2687
2688 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 2688 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
2689 2689
2690 "movq (%2, %3, 4), %%mm2 \n\t" 2690 "movq (%2, %3, 4), %%mm2 \n\t"
2691 PMAXUB(%%mm2, %%mm4) 2691 PMAXUB(%%mm2, %%mm4)
2692 PMINUB(%%mm2, %%mm3, %%mm5) 2692 PMINUB(%%mm2, %%mm3, %%mm5)
2693 "psubb %%mm2, %%mm1 \n\t" 2693 "psubb %%mm2, %%mm1 \n\t"
2694 "paddb %%mm7, %%mm1 \n\t" 2694 "paddb %%mm7, %%mm1 \n\t"
2695 "pcmpgtb %%mm6, %%mm1 \n\t" 2695 "pcmpgtb %%mm6, %%mm1 \n\t"
2696 "paddb %%mm1, %%mm0 \n\t" 2696 "paddb %%mm1, %%mm0 \n\t"
2697 2697
2698 "movq (%%"REG_a"), %%mm1 \n\t" 2698 "movq (%%"REG_a"), %%mm1 \n\t"
2699 PMAXUB(%%mm1, %%mm4) 2699 PMAXUB(%%mm1, %%mm4)
2700 PMINUB(%%mm1, %%mm3, %%mm5) 2700 PMINUB(%%mm1, %%mm3, %%mm5)
2701 "psubb %%mm1, %%mm2 \n\t" 2701 "psubb %%mm1, %%mm2 \n\t"
2702 "paddb %%mm7, %%mm2 \n\t" 2702 "paddb %%mm7, %%mm2 \n\t"
2703 "pcmpgtb %%mm6, %%mm2 \n\t" 2703 "pcmpgtb %%mm6, %%mm2 \n\t"
2704 "paddb %%mm2, %%mm0 \n\t" 2704 "paddb %%mm2, %%mm0 \n\t"
2705 2705
2706 "movq (%%"REG_a", %3), %%mm2 \n\t" 2706 "movq (%%"REG_a", %3), %%mm2 \n\t"
2707 PMAXUB(%%mm2, %%mm4) 2707 PMAXUB(%%mm2, %%mm4)
2708 PMINUB(%%mm2, %%mm3, %%mm5) 2708 PMINUB(%%mm2, %%mm3, %%mm5)
2709 "psubb %%mm2, %%mm1 \n\t" 2709 "psubb %%mm2, %%mm1 \n\t"
2710 "paddb %%mm7, %%mm1 \n\t" 2710 "paddb %%mm7, %%mm1 \n\t"
2711 "pcmpgtb %%mm6, %%mm1 \n\t" 2711 "pcmpgtb %%mm6, %%mm1 \n\t"
2712 "paddb %%mm1, %%mm0 \n\t" 2712 "paddb %%mm1, %%mm0 \n\t"
2713 2713
2714 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2714 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2715 PMAXUB(%%mm1, %%mm4) 2715 PMAXUB(%%mm1, %%mm4)
2716 PMINUB(%%mm1, %%mm3, %%mm5) 2716 PMINUB(%%mm1, %%mm3, %%mm5)
2717 "psubb %%mm1, %%mm2 \n\t" 2717 "psubb %%mm1, %%mm2 \n\t"
2718 "paddb %%mm7, %%mm2 \n\t" 2718 "paddb %%mm7, %%mm2 \n\t"
2719 "pcmpgtb %%mm6, %%mm2 \n\t" 2719 "pcmpgtb %%mm6, %%mm2 \n\t"
2720 "paddb %%mm2, %%mm0 \n\t" 2720 "paddb %%mm2, %%mm0 \n\t"
2721 2721
2722 "movq (%2, %3, 8), %%mm2 \n\t" 2722 "movq (%2, %3, 8), %%mm2 \n\t"
2723 PMAXUB(%%mm2, %%mm4) 2723 PMAXUB(%%mm2, %%mm4)
2724 PMINUB(%%mm2, %%mm3, %%mm5) 2724 PMINUB(%%mm2, %%mm3, %%mm5)
2725 "psubb %%mm2, %%mm1 \n\t" 2725 "psubb %%mm2, %%mm1 \n\t"
2726 "paddb %%mm7, %%mm1 \n\t" 2726 "paddb %%mm7, %%mm1 \n\t"
2727 "pcmpgtb %%mm6, %%mm1 \n\t" 2727 "pcmpgtb %%mm6, %%mm1 \n\t"
2728 "paddb %%mm1, %%mm0 \n\t" 2728 "paddb %%mm1, %%mm0 \n\t"
2729 2729
2730 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" 2730 "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
2731 "psubb %%mm1, %%mm2 \n\t" 2731 "psubb %%mm1, %%mm2 \n\t"
2732 "paddb %%mm7, %%mm2 \n\t" 2732 "paddb %%mm7, %%mm2 \n\t"
2733 "pcmpgtb %%mm6, %%mm2 \n\t" 2733 "pcmpgtb %%mm6, %%mm2 \n\t"
2734 "paddb %%mm2, %%mm0 \n\t" 2734 "paddb %%mm2, %%mm0 \n\t"
2735 "psubusb %%mm3, %%mm4 \n\t" 2735 "psubusb %%mm3, %%mm4 \n\t"
2736 2736
2737 "pxor %%mm6, %%mm6 \n\t" 2737 "pxor %%mm6, %%mm6 \n\t"
2738 "movq %4, %%mm7 \n\t" // QP,..., QP 2738 "movq %4, %%mm7 \n\t" // QP,..., QP
2739 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 2739 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2740 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 2740 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2741 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 2741 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2742 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF 2742 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF
2743 "movq %%mm7, %1 \n\t" 2743 "movq %%mm7, %1 \n\t"
2744 2744
2745 "movq %5, %%mm7 \n\t" 2745 "movq %5, %%mm7 \n\t"
2746 "punpcklbw %%mm7, %%mm7 \n\t" 2746 "punpcklbw %%mm7, %%mm7 \n\t"
2747 "punpcklbw %%mm7, %%mm7 \n\t" 2747 "punpcklbw %%mm7, %%mm7 \n\t"
2748 "punpcklbw %%mm7, %%mm7 \n\t" 2748 "punpcklbw %%mm7, %%mm7 \n\t"
2749 "psubb %%mm0, %%mm6 \n\t" 2749 "psubb %%mm0, %%mm6 \n\t"
2750 "pcmpgtb %%mm7, %%mm6 \n\t" 2750 "pcmpgtb %%mm7, %%mm6 \n\t"
2751 "movq %%mm6, %0 \n\t" 2751 "movq %%mm6, %0 \n\t"
2752 2752
2753 : "=m" (eq_mask), "=m" (dc_mask) 2753 : "=m" (eq_mask), "=m" (dc_mask)
2754 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 2754 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2755 : "%"REG_a 2755 : "%"REG_a
2756 ); 2756 );
2757 2757
2758 both_masks = dc_mask & eq_mask; 2758 both_masks = dc_mask & eq_mask;
2759 2759
2760 if(both_masks){ 2760 if(both_masks){
2761 long offset= -8*step; 2761 long offset= -8*step;
2762 int64_t *temp_sums= sums; 2762 int64_t *temp_sums= sums;
2763 2763
2764 asm volatile( 2764 asm volatile(
2765 "movq %2, %%mm0 \n\t" // QP,..., QP 2765 "movq %2, %%mm0 \n\t" // QP,..., QP
2766 "pxor %%mm4, %%mm4 \n\t" 2766 "pxor %%mm4, %%mm4 \n\t"
2767 2767
2768 "movq (%0), %%mm6 \n\t" 2768 "movq (%0), %%mm6 \n\t"
2769 "movq (%0, %1), %%mm5 \n\t" 2769 "movq (%0, %1), %%mm5 \n\t"
2770 "movq %%mm5, %%mm1 \n\t" 2770 "movq %%mm5, %%mm1 \n\t"
2771 "movq %%mm6, %%mm2 \n\t" 2771 "movq %%mm6, %%mm2 \n\t"
2772 "psubusb %%mm6, %%mm5 \n\t" 2772 "psubusb %%mm6, %%mm5 \n\t"
2773 "psubusb %%mm1, %%mm2 \n\t" 2773 "psubusb %%mm1, %%mm2 \n\t"
2774 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2774 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2775 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2775 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2776 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2776 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2777 2777
2778 "pxor %%mm6, %%mm1 \n\t" 2778 "pxor %%mm6, %%mm1 \n\t"
2779 "pand %%mm0, %%mm1 \n\t" 2779 "pand %%mm0, %%mm1 \n\t"
2780 "pxor %%mm1, %%mm6 \n\t" 2780 "pxor %%mm1, %%mm6 \n\t"
2781 // 0:QP 6:First 2781 // 0:QP 6:First
2782 2782
2783 "movq (%0, %1, 8), %%mm5 \n\t" 2783 "movq (%0, %1, 8), %%mm5 \n\t"
2784 "add %1, %0 \n\t" // %0 points to line 1 not 0 2784 "add %1, %0 \n\t" // %0 points to line 1 not 0
2785 "movq (%0, %1, 8), %%mm7 \n\t" 2785 "movq (%0, %1, 8), %%mm7 \n\t"
2786 "movq %%mm5, %%mm1 \n\t" 2786 "movq %%mm5, %%mm1 \n\t"
2787 "movq %%mm7, %%mm2 \n\t" 2787 "movq %%mm7, %%mm2 \n\t"
2788 "psubusb %%mm7, %%mm5 \n\t" 2788 "psubusb %%mm7, %%mm5 \n\t"
2789 "psubusb %%mm1, %%mm2 \n\t" 2789 "psubusb %%mm1, %%mm2 \n\t"
2790 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2790 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2791 "movq %2, %%mm0 \n\t" // QP,..., QP 2791 "movq %2, %%mm0 \n\t" // QP,..., QP
2792 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2792 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2793 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2793 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2794 2794
2795 "pxor %%mm7, %%mm1 \n\t" 2795 "pxor %%mm7, %%mm1 \n\t"
2796 "pand %%mm0, %%mm1 \n\t" 2796 "pand %%mm0, %%mm1 \n\t"
2797 "pxor %%mm1, %%mm7 \n\t" 2797 "pxor %%mm1, %%mm7 \n\t"
2798 2798
2799 "movq %%mm6, %%mm5 \n\t" 2799 "movq %%mm6, %%mm5 \n\t"
2800 "punpckhbw %%mm4, %%mm6 \n\t" 2800 "punpckhbw %%mm4, %%mm6 \n\t"
2801 "punpcklbw %%mm4, %%mm5 \n\t" 2801 "punpcklbw %%mm4, %%mm5 \n\t"
2802 // 4:0 5/6:First 7:Last 2802 // 4:0 5/6:First 7:Last
2803 2803
2804 "movq %%mm5, %%mm0 \n\t" 2804 "movq %%mm5, %%mm0 \n\t"
2805 "movq %%mm6, %%mm1 \n\t" 2805 "movq %%mm6, %%mm1 \n\t"
2806 "psllw $2, %%mm0 \n\t" 2806 "psllw $2, %%mm0 \n\t"
2807 "psllw $2, %%mm1 \n\t" 2807 "psllw $2, %%mm1 \n\t"
2808 "paddw "MANGLE(w04)", %%mm0 \n\t" 2808 "paddw "MANGLE(w04)", %%mm0 \n\t"
2809 "paddw "MANGLE(w04)", %%mm1 \n\t" 2809 "paddw "MANGLE(w04)", %%mm1 \n\t"
2810 2810
2811 #define NEXT\ 2811 #define NEXT\
2812 "movq (%0), %%mm2 \n\t"\ 2812 "movq (%0), %%mm2 \n\t"\
2813 "movq (%0), %%mm3 \n\t"\ 2813 "movq (%0), %%mm3 \n\t"\
2814 "add %1, %0 \n\t"\ 2814 "add %1, %0 \n\t"\
2815 "punpcklbw %%mm4, %%mm2 \n\t"\ 2815 "punpcklbw %%mm4, %%mm2 \n\t"\
2816 "punpckhbw %%mm4, %%mm3 \n\t"\ 2816 "punpckhbw %%mm4, %%mm3 \n\t"\
2817 "paddw %%mm2, %%mm0 \n\t"\ 2817 "paddw %%mm2, %%mm0 \n\t"\
2818 "paddw %%mm3, %%mm1 \n\t" 2818 "paddw %%mm3, %%mm1 \n\t"
2819 2819
2820 #define PREV\ 2820 #define PREV\
2821 "movq (%0), %%mm2 \n\t"\ 2821 "movq (%0), %%mm2 \n\t"\
2822 "movq (%0), %%mm3 \n\t"\ 2822 "movq (%0), %%mm3 \n\t"\
2823 "add %1, %0 \n\t"\ 2823 "add %1, %0 \n\t"\
2824 "punpcklbw %%mm4, %%mm2 \n\t"\ 2824 "punpcklbw %%mm4, %%mm2 \n\t"\
2825 "punpckhbw %%mm4, %%mm3 \n\t"\ 2825 "punpckhbw %%mm4, %%mm3 \n\t"\
2826 "psubw %%mm2, %%mm0 \n\t"\ 2826 "psubw %%mm2, %%mm0 \n\t"\
2827 "psubw %%mm3, %%mm1 \n\t" 2827 "psubw %%mm3, %%mm1 \n\t"
2828 2828
2829 2829
2830 NEXT //0 2830 NEXT //0
2831 NEXT //1 2831 NEXT //1
2832 NEXT //2 2832 NEXT //2
2833 "movq %%mm0, (%3) \n\t" 2833 "movq %%mm0, (%3) \n\t"
2834 "movq %%mm1, 8(%3) \n\t" 2834 "movq %%mm1, 8(%3) \n\t"
2835 2835
2836 NEXT //3 2836 NEXT //3
2837 "psubw %%mm5, %%mm0 \n\t" 2837 "psubw %%mm5, %%mm0 \n\t"
2838 "psubw %%mm6, %%mm1 \n\t" 2838 "psubw %%mm6, %%mm1 \n\t"
2839 "movq %%mm0, 16(%3) \n\t" 2839 "movq %%mm0, 16(%3) \n\t"
2840 "movq %%mm1, 24(%3) \n\t" 2840 "movq %%mm1, 24(%3) \n\t"
2841 2841
2842 NEXT //4 2842 NEXT //4
2843 "psubw %%mm5, %%mm0 \n\t" 2843 "psubw %%mm5, %%mm0 \n\t"
2844 "psubw %%mm6, %%mm1 \n\t" 2844 "psubw %%mm6, %%mm1 \n\t"
2845 "movq %%mm0, 32(%3) \n\t" 2845 "movq %%mm0, 32(%3) \n\t"
2846 "movq %%mm1, 40(%3) \n\t" 2846 "movq %%mm1, 40(%3) \n\t"
2847 2847
2848 NEXT //5 2848 NEXT //5
2849 "psubw %%mm5, %%mm0 \n\t" 2849 "psubw %%mm5, %%mm0 \n\t"
2850 "psubw %%mm6, %%mm1 \n\t" 2850 "psubw %%mm6, %%mm1 \n\t"
2851 "movq %%mm0, 48(%3) \n\t" 2851 "movq %%mm0, 48(%3) \n\t"
2852 "movq %%mm1, 56(%3) \n\t" 2852 "movq %%mm1, 56(%3) \n\t"
2853 2853
2854 NEXT //6 2854 NEXT //6
2855 "psubw %%mm5, %%mm0 \n\t" 2855 "psubw %%mm5, %%mm0 \n\t"
2856 "psubw %%mm6, %%mm1 \n\t" 2856 "psubw %%mm6, %%mm1 \n\t"
2857 "movq %%mm0, 64(%3) \n\t" 2857 "movq %%mm0, 64(%3) \n\t"
2858 "movq %%mm1, 72(%3) \n\t" 2858 "movq %%mm1, 72(%3) \n\t"
2859 2859
2860 "movq %%mm7, %%mm6 \n\t" 2860 "movq %%mm7, %%mm6 \n\t"
2861 "punpckhbw %%mm4, %%mm7 \n\t" 2861 "punpckhbw %%mm4, %%mm7 \n\t"
2862 "punpcklbw %%mm4, %%mm6 \n\t" 2862 "punpcklbw %%mm4, %%mm6 \n\t"
2863 2863
2864 NEXT //7 2864 NEXT //7
2865 "mov %4, %0 \n\t" 2865 "mov %4, %0 \n\t"
2866 "add %1, %0 \n\t" 2866 "add %1, %0 \n\t"
2867 PREV //0 2867 PREV //0
2868 "movq %%mm0, 80(%3) \n\t" 2868 "movq %%mm0, 80(%3) \n\t"
2869 "movq %%mm1, 88(%3) \n\t" 2869 "movq %%mm1, 88(%3) \n\t"
2870 2870
2871 PREV //1 2871 PREV //1
2872 "paddw %%mm6, %%mm0 \n\t" 2872 "paddw %%mm6, %%mm0 \n\t"
2873 "paddw %%mm7, %%mm1 \n\t" 2873 "paddw %%mm7, %%mm1 \n\t"
2874 "movq %%mm0, 96(%3) \n\t" 2874 "movq %%mm0, 96(%3) \n\t"
2875 "movq %%mm1, 104(%3) \n\t" 2875 "movq %%mm1, 104(%3) \n\t"
2876 2876
2877 PREV //2 2877 PREV //2
2878 "paddw %%mm6, %%mm0 \n\t" 2878 "paddw %%mm6, %%mm0 \n\t"
2879 "paddw %%mm7, %%mm1 \n\t" 2879 "paddw %%mm7, %%mm1 \n\t"
2880 "movq %%mm0, 112(%3) \n\t" 2880 "movq %%mm0, 112(%3) \n\t"
2881 "movq %%mm1, 120(%3) \n\t" 2881 "movq %%mm1, 120(%3) \n\t"
2882 2882
2883 PREV //3 2883 PREV //3
2884 "paddw %%mm6, %%mm0 \n\t" 2884 "paddw %%mm6, %%mm0 \n\t"
2885 "paddw %%mm7, %%mm1 \n\t" 2885 "paddw %%mm7, %%mm1 \n\t"
2886 "movq %%mm0, 128(%3) \n\t" 2886 "movq %%mm0, 128(%3) \n\t"
2887 "movq %%mm1, 136(%3) \n\t" 2887 "movq %%mm1, 136(%3) \n\t"
2888 2888
2889 PREV //4 2889 PREV //4
2890 "paddw %%mm6, %%mm0 \n\t" 2890 "paddw %%mm6, %%mm0 \n\t"
2891 "paddw %%mm7, %%mm1 \n\t" 2891 "paddw %%mm7, %%mm1 \n\t"
2892 "movq %%mm0, 144(%3) \n\t" 2892 "movq %%mm0, 144(%3) \n\t"
2893 "movq %%mm1, 152(%3) \n\t" 2893 "movq %%mm1, 152(%3) \n\t"
2894 2894
2895 "mov %4, %0 \n\t" //FIXME 2895 "mov %4, %0 \n\t" //FIXME
2896 2896
2897 : "+&r"(src) 2897 : "+&r"(src)
2898 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) 2898 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2899 ); 2899 );
2900 2900
2901 src+= step; // src points to begin of the 8x8 Block 2901 src+= step; // src points to begin of the 8x8 Block
2902 2902
2903 asm volatile( 2903 asm volatile(
2904 "movq %4, %%mm6 \n\t" 2904 "movq %4, %%mm6 \n\t"
2905 "pcmpeqb %%mm5, %%mm5 \n\t" 2905 "pcmpeqb %%mm5, %%mm5 \n\t"
2906 "pxor %%mm6, %%mm5 \n\t" 2906 "pxor %%mm6, %%mm5 \n\t"
2907 "pxor %%mm7, %%mm7 \n\t" 2907 "pxor %%mm7, %%mm7 \n\t"
2908 2908
2909 "1: \n\t" 2909 "1: \n\t"
2910 "movq (%1), %%mm0 \n\t" 2910 "movq (%1), %%mm0 \n\t"
2911 "movq 8(%1), %%mm1 \n\t" 2911 "movq 8(%1), %%mm1 \n\t"
2912 "paddw 32(%1), %%mm0 \n\t" 2912 "paddw 32(%1), %%mm0 \n\t"
2913 "paddw 40(%1), %%mm1 \n\t" 2913 "paddw 40(%1), %%mm1 \n\t"
2914 "movq (%0, %3), %%mm2 \n\t" 2914 "movq (%0, %3), %%mm2 \n\t"
2915 "movq %%mm2, %%mm3 \n\t" 2915 "movq %%mm2, %%mm3 \n\t"
2916 "movq %%mm2, %%mm4 \n\t" 2916 "movq %%mm2, %%mm4 \n\t"
2917 "punpcklbw %%mm7, %%mm2 \n\t" 2917 "punpcklbw %%mm7, %%mm2 \n\t"
2918 "punpckhbw %%mm7, %%mm3 \n\t" 2918 "punpckhbw %%mm7, %%mm3 \n\t"
2919 "paddw %%mm2, %%mm0 \n\t" 2919 "paddw %%mm2, %%mm0 \n\t"
2920 "paddw %%mm3, %%mm1 \n\t" 2920 "paddw %%mm3, %%mm1 \n\t"
2921 "paddw %%mm2, %%mm0 \n\t" 2921 "paddw %%mm2, %%mm0 \n\t"
2922 "paddw %%mm3, %%mm1 \n\t" 2922 "paddw %%mm3, %%mm1 \n\t"
2923 "psrlw $4, %%mm0 \n\t" 2923 "psrlw $4, %%mm0 \n\t"
2924 "psrlw $4, %%mm1 \n\t" 2924 "psrlw $4, %%mm1 \n\t"
2925 "packuswb %%mm1, %%mm0 \n\t" 2925 "packuswb %%mm1, %%mm0 \n\t"
2926 "pand %%mm6, %%mm0 \n\t" 2926 "pand %%mm6, %%mm0 \n\t"
2927 "pand %%mm5, %%mm4 \n\t" 2927 "pand %%mm5, %%mm4 \n\t"
2928 "por %%mm4, %%mm0 \n\t" 2928 "por %%mm4, %%mm0 \n\t"
2929 "movq %%mm0, (%0, %3) \n\t" 2929 "movq %%mm0, (%0, %3) \n\t"
2930 "add $16, %1 \n\t" 2930 "add $16, %1 \n\t"
2931 "add %2, %0 \n\t" 2931 "add %2, %0 \n\t"
2932 " js 1b \n\t" 2932 " js 1b \n\t"
2933 2933
2934 : "+r"(offset), "+r"(temp_sums) 2934 : "+r"(offset), "+r"(temp_sums)
2935 : "r" ((long)step), "r"(src - offset), "m"(both_masks) 2935 : "r" ((long)step), "r"(src - offset), "m"(both_masks)
2936 ); 2936 );
2937 }else 2937 }else
2938 src+= step; // src points to begin of the 8x8 Block 2938 src+= step; // src points to begin of the 8x8 Block
2939 2939
2940 if(eq_mask != -1LL){ 2940 if(eq_mask != -1LL){
2941 uint8_t *temp_src= src; 2941 uint8_t *temp_src= src;
2942 asm volatile( 2942 asm volatile(
2943 "pxor %%mm7, %%mm7 \n\t" 2943 "pxor %%mm7, %%mm7 \n\t"
2944 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 2944 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
2945 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 2945 "and "ALIGN_MASK", %%"REG_c" \n\t" // align
2946 // 0 1 2 3 4 5 6 7 8 9 2946 // 0 1 2 3 4 5 6 7 8 9
2947 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 2947 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2948 2948
2949 "movq (%0), %%mm0 \n\t" 2949 "movq (%0), %%mm0 \n\t"
2950 "movq %%mm0, %%mm1 \n\t" 2950 "movq %%mm0, %%mm1 \n\t"
2951 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 2951 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2952 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 2952 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2953 2953
2954 "movq (%0, %1), %%mm2 \n\t" 2954 "movq (%0, %1), %%mm2 \n\t"
2955 "lea (%0, %1, 2), %%"REG_a" \n\t" 2955 "lea (%0, %1, 2), %%"REG_a" \n\t"
2956 "movq %%mm2, %%mm3 \n\t" 2956 "movq %%mm2, %%mm3 \n\t"
2957 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 2957 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2958 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 2958 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2959 2959
2960 "movq (%%"REG_a"), %%mm4 \n\t" 2960 "movq (%%"REG_a"), %%mm4 \n\t"
2961 "movq %%mm4, %%mm5 \n\t" 2961 "movq %%mm4, %%mm5 \n\t"
2962 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 2962 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2963 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 2963 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2964 2964
2965 "paddw %%mm0, %%mm0 \n\t" // 2L0 2965 "paddw %%mm0, %%mm0 \n\t" // 2L0
2966 "paddw %%mm1, %%mm1 \n\t" // 2H0 2966 "paddw %%mm1, %%mm1 \n\t" // 2H0
2967 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 2967 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2968 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 2968 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2969 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 2969 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2970 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 2970 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2971 2971
2972 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 2972 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2973 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 2973 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2974 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 2974 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2975 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 2975 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2976 2976
2977 "movq (%%"REG_a", %1), %%mm2 \n\t" 2977 "movq (%%"REG_a", %1), %%mm2 \n\t"
2978 "movq %%mm2, %%mm3 \n\t" 2978 "movq %%mm2, %%mm3 \n\t"
2979 "punpcklbw %%mm7, %%mm2 \n\t" // L3 2979 "punpcklbw %%mm7, %%mm2 \n\t" // L3
2980 "punpckhbw %%mm7, %%mm3 \n\t" // H3 2980 "punpckhbw %%mm7, %%mm3 \n\t" // H3
2981 2981
2982 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 2982 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2983 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 2983 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2984 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2984 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2985 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2985 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2986 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2986 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2987 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2987 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2988 2988
2989 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 2989 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
2990 "movq %%mm0, %%mm1 \n\t" 2990 "movq %%mm0, %%mm1 \n\t"
2991 "punpcklbw %%mm7, %%mm0 \n\t" // L4 2991 "punpcklbw %%mm7, %%mm0 \n\t" // L4
2992 "punpckhbw %%mm7, %%mm1 \n\t" // H4 2992 "punpckhbw %%mm7, %%mm1 \n\t" // H4
2993 2993
2994 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 2994 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2995 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 2995 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2996 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 2996 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
2997 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 2997 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
2998 "paddw %%mm4, %%mm4 \n\t" // 2L2 2998 "paddw %%mm4, %%mm4 \n\t" // 2L2
2999 "paddw %%mm5, %%mm5 \n\t" // 2H2 2999 "paddw %%mm5, %%mm5 \n\t" // 2H2
3000 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 3000 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
3001 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 3001 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
3002 3002
3003 "lea (%%"REG_a", %1), %0 \n\t" 3003 "lea (%%"REG_a", %1), %0 \n\t"
3004 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 3004 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
3005 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 3005 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
3006 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 3006 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
3007 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 3007 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
3008 //50 opcodes so far 3008 //50 opcodes so far
3009 "movq (%0, %1, 2), %%mm2 \n\t" 3009 "movq (%0, %1, 2), %%mm2 \n\t"
3010 "movq %%mm2, %%mm3 \n\t" 3010 "movq %%mm2, %%mm3 \n\t"
3011 "punpcklbw %%mm7, %%mm2 \n\t" // L5 3011 "punpcklbw %%mm7, %%mm2 \n\t" // L5
3012 "punpckhbw %%mm7, %%mm3 \n\t" // H5 3012 "punpckhbw %%mm7, %%mm3 \n\t" // H5
3013 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 3013 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
3014 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 3014 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
3015 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 3015 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
3016 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 3016 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3017 3017
3018 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 3018 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3019 "punpcklbw %%mm7, %%mm6 \n\t" // L6 3019 "punpcklbw %%mm7, %%mm6 \n\t" // L6
3020 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 3020 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
3021 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 3021 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3022 "punpckhbw %%mm7, %%mm6 \n\t" // H6 3022 "punpckhbw %%mm7, %%mm6 \n\t" // H6
3023 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 3023 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
3024 3024
3025 "paddw %%mm0, %%mm0 \n\t" // 2L4 3025 "paddw %%mm0, %%mm0 \n\t" // 2L4
3026 "paddw %%mm1, %%mm1 \n\t" // 2H4 3026 "paddw %%mm1, %%mm1 \n\t" // 2H4
3027 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 3027 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
3028 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 3028 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
3029 3029
3030 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 3030 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
3031 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 3031 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
3032 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 3032 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
3033 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 3033 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
3034 3034
3035 "movq (%0, %1, 4), %%mm2 \n\t" 3035 "movq (%0, %1, 4), %%mm2 \n\t"
3036 "movq %%mm2, %%mm3 \n\t" 3036 "movq %%mm2, %%mm3 \n\t"
3037 "punpcklbw %%mm7, %%mm2 \n\t" // L7 3037 "punpcklbw %%mm7, %%mm2 \n\t" // L7
3038 "punpckhbw %%mm7, %%mm3 \n\t" // H7 3038 "punpckhbw %%mm7, %%mm3 \n\t" // H7
3039 3039
3040 "paddw %%mm2, %%mm2 \n\t" // 2L7 3040 "paddw %%mm2, %%mm2 \n\t" // 2L7
3041 "paddw %%mm3, %%mm3 \n\t" // 2H7 3041 "paddw %%mm3, %%mm3 \n\t" // 2H7
3042 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 3042 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3043 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 3043 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3044 3044
3045 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 3045 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3046 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 3046 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3047 3047
3048 #ifdef HAVE_MMX2 3048 #ifdef HAVE_MMX2
3049 "movq %%mm7, %%mm6 \n\t" // 0 3049 "movq %%mm7, %%mm6 \n\t" // 0
3050 "psubw %%mm0, %%mm6 \n\t" 3050 "psubw %%mm0, %%mm6 \n\t"
3051 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 3051 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3052 "movq %%mm7, %%mm6 \n\t" // 0 3052 "movq %%mm7, %%mm6 \n\t" // 0
3053 "psubw %%mm1, %%mm6 \n\t" 3053 "psubw %%mm1, %%mm6 \n\t"
3054 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 3054 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3055 "movq %%mm7, %%mm6 \n\t" // 0 3055 "movq %%mm7, %%mm6 \n\t" // 0
3056 "psubw %%mm2, %%mm6 \n\t" 3056 "psubw %%mm2, %%mm6 \n\t"
3057 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 3057 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3058 "movq %%mm7, %%mm6 \n\t" // 0 3058 "movq %%mm7, %%mm6 \n\t" // 0
3059 "psubw %%mm3, %%mm6 \n\t" 3059 "psubw %%mm3, %%mm6 \n\t"
3060 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 3060 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3061 #else 3061 #else
3062 "movq %%mm7, %%mm6 \n\t" // 0 3062 "movq %%mm7, %%mm6 \n\t" // 0
3063 "pcmpgtw %%mm0, %%mm6 \n\t" 3063 "pcmpgtw %%mm0, %%mm6 \n\t"
3064 "pxor %%mm6, %%mm0 \n\t" 3064 "pxor %%mm6, %%mm0 \n\t"
3065 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 3065 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3066 "movq %%mm7, %%mm6 \n\t" // 0 3066 "movq %%mm7, %%mm6 \n\t" // 0
3067 "pcmpgtw %%mm1, %%mm6 \n\t" 3067 "pcmpgtw %%mm1, %%mm6 \n\t"
3068 "pxor %%mm6, %%mm1 \n\t" 3068 "pxor %%mm6, %%mm1 \n\t"
3069 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 3069 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3070 "movq %%mm7, %%mm6 \n\t" // 0 3070 "movq %%mm7, %%mm6 \n\t" // 0
3071 "pcmpgtw %%mm2, %%mm6 \n\t" 3071 "pcmpgtw %%mm2, %%mm6 \n\t"
3072 "pxor %%mm6, %%mm2 \n\t" 3072 "pxor %%mm6, %%mm2 \n\t"
3073 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 3073 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3074 "movq %%mm7, %%mm6 \n\t" // 0 3074 "movq %%mm7, %%mm6 \n\t" // 0
3075 "pcmpgtw %%mm3, %%mm6 \n\t" 3075 "pcmpgtw %%mm3, %%mm6 \n\t"
3076 "pxor %%mm6, %%mm3 \n\t" 3076 "pxor %%mm6, %%mm3 \n\t"
3077 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 3077 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3078 #endif 3078 #endif
3079 3079
3080 #ifdef HAVE_MMX2 3080 #ifdef HAVE_MMX2
3081 "pminsw %%mm2, %%mm0 \n\t" 3081 "pminsw %%mm2, %%mm0 \n\t"
3082 "pminsw %%mm3, %%mm1 \n\t" 3082 "pminsw %%mm3, %%mm1 \n\t"
3083 #else 3083 #else
3084 "movq %%mm0, %%mm6 \n\t" 3084 "movq %%mm0, %%mm6 \n\t"
3085 "psubusw %%mm2, %%mm6 \n\t" 3085 "psubusw %%mm2, %%mm6 \n\t"
3086 "psubw %%mm6, %%mm0 \n\t" 3086 "psubw %%mm6, %%mm0 \n\t"
3087 "movq %%mm1, %%mm6 \n\t" 3087 "movq %%mm1, %%mm6 \n\t"
3088 "psubusw %%mm3, %%mm6 \n\t" 3088 "psubusw %%mm3, %%mm6 \n\t"
3089 "psubw %%mm6, %%mm1 \n\t" 3089 "psubw %%mm6, %%mm1 \n\t"
3090 #endif 3090 #endif
3091 3091
3092 "movd %2, %%mm2 \n\t" // QP 3092 "movd %2, %%mm2 \n\t" // QP
3093 "punpcklbw %%mm7, %%mm2 \n\t" 3093 "punpcklbw %%mm7, %%mm2 \n\t"
3094 3094
3095 "movq %%mm7, %%mm6 \n\t" // 0 3095 "movq %%mm7, %%mm6 \n\t" // 0
3096 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 3096 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3097 "pxor %%mm6, %%mm4 \n\t" 3097 "pxor %%mm6, %%mm4 \n\t"
3098 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 3098 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3099 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 3099 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3100 "pxor %%mm7, %%mm5 \n\t" 3100 "pxor %%mm7, %%mm5 \n\t"
3101 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 3101 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3102 // 100 opcodes 3102 // 100 opcodes
3103 "psllw $3, %%mm2 \n\t" // 8QP 3103 "psllw $3, %%mm2 \n\t" // 8QP
3104 "movq %%mm2, %%mm3 \n\t" // 8QP 3104 "movq %%mm2, %%mm3 \n\t" // 8QP
3105 "pcmpgtw %%mm4, %%mm2 \n\t" 3105 "pcmpgtw %%mm4, %%mm2 \n\t"
3106 "pcmpgtw %%mm5, %%mm3 \n\t" 3106 "pcmpgtw %%mm5, %%mm3 \n\t"
3107 "pand %%mm2, %%mm4 \n\t" 3107 "pand %%mm2, %%mm4 \n\t"
3108 "pand %%mm3, %%mm5 \n\t" 3108 "pand %%mm3, %%mm5 \n\t"
3109 3109
3110 3110
3111 "psubusw %%mm0, %%mm4 \n\t" // hd 3111 "psubusw %%mm0, %%mm4 \n\t" // hd
3112 "psubusw %%mm1, %%mm5 \n\t" // ld 3112 "psubusw %%mm1, %%mm5 \n\t" // ld
3113 3113
3114 3114
3115 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 3115 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3116 "pmullw %%mm2, %%mm4 \n\t" 3116 "pmullw %%mm2, %%mm4 \n\t"
3117 "pmullw %%mm2, %%mm5 \n\t" 3117 "pmullw %%mm2, %%mm5 \n\t"
3118 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 3118 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3119 "paddw %%mm2, %%mm4 \n\t" 3119 "paddw %%mm2, %%mm4 \n\t"
3120 "paddw %%mm2, %%mm5 \n\t" 3120 "paddw %%mm2, %%mm5 \n\t"
3121 "psrlw $6, %%mm4 \n\t" 3121 "psrlw $6, %%mm4 \n\t"
3122 "psrlw $6, %%mm5 \n\t" 3122 "psrlw $6, %%mm5 \n\t"
3123 3123
3124 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 3124 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
3125 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 3125 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
3126 3126
3127 "pxor %%mm2, %%mm2 \n\t" 3127 "pxor %%mm2, %%mm2 \n\t"
3128 "pxor %%mm3, %%mm3 \n\t" 3128 "pxor %%mm3, %%mm3 \n\t"
3129 3129
3130 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 3130 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3131 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 3131 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3132 "pxor %%mm2, %%mm0 \n\t" 3132 "pxor %%mm2, %%mm0 \n\t"
3133 "pxor %%mm3, %%mm1 \n\t" 3133 "pxor %%mm3, %%mm1 \n\t"
3134 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 3134 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3135 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 3135 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3136 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 3136 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3137 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 3137 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3138 3138
3139 "pxor %%mm6, %%mm2 \n\t" 3139 "pxor %%mm6, %%mm2 \n\t"
3140 "pxor %%mm7, %%mm3 \n\t" 3140 "pxor %%mm7, %%mm3 \n\t"
3141 "pand %%mm2, %%mm4 \n\t" 3141 "pand %%mm2, %%mm4 \n\t"
3142 "pand %%mm3, %%mm5 \n\t" 3142 "pand %%mm3, %%mm5 \n\t"
3143 3143
3144 #ifdef HAVE_MMX2 3144 #ifdef HAVE_MMX2
3145 "pminsw %%mm0, %%mm4 \n\t" 3145 "pminsw %%mm0, %%mm4 \n\t"
3146 "pminsw %%mm1, %%mm5 \n\t" 3146 "pminsw %%mm1, %%mm5 \n\t"
3147 #else 3147 #else
3148 "movq %%mm4, %%mm2 \n\t" 3148 "movq %%mm4, %%mm2 \n\t"
3149 "psubusw %%mm0, %%mm2 \n\t" 3149 "psubusw %%mm0, %%mm2 \n\t"
3150 "psubw %%mm2, %%mm4 \n\t" 3150 "psubw %%mm2, %%mm4 \n\t"
3151 "movq %%mm5, %%mm2 \n\t" 3151 "movq %%mm5, %%mm2 \n\t"
3152 "psubusw %%mm1, %%mm2 \n\t" 3152 "psubusw %%mm1, %%mm2 \n\t"
3153 "psubw %%mm2, %%mm5 \n\t" 3153 "psubw %%mm2, %%mm5 \n\t"
3154 #endif 3154 #endif
3155 "pxor %%mm6, %%mm4 \n\t" 3155 "pxor %%mm6, %%mm4 \n\t"
3156 "pxor %%mm7, %%mm5 \n\t" 3156 "pxor %%mm7, %%mm5 \n\t"
3157 "psubw %%mm6, %%mm4 \n\t" 3157 "psubw %%mm6, %%mm4 \n\t"
3158 "psubw %%mm7, %%mm5 \n\t" 3158 "psubw %%mm7, %%mm5 \n\t"
3159 "packsswb %%mm5, %%mm4 \n\t" 3159 "packsswb %%mm5, %%mm4 \n\t"
3160 "movq %3, %%mm1 \n\t" 3160 "movq %3, %%mm1 \n\t"
3161 "pandn %%mm4, %%mm1 \n\t" 3161 "pandn %%mm4, %%mm1 \n\t"
3162 "movq (%0), %%mm0 \n\t" 3162 "movq (%0), %%mm0 \n\t"
3163 "paddb %%mm1, %%mm0 \n\t" 3163 "paddb %%mm1, %%mm0 \n\t"
3164 "movq %%mm0, (%0) \n\t" 3164 "movq %%mm0, (%0) \n\t"
3165 "movq (%0, %1), %%mm0 \n\t" 3165 "movq (%0, %1), %%mm0 \n\t"
3166 "psubb %%mm1, %%mm0 \n\t" 3166 "psubb %%mm1, %%mm0 \n\t"
3167 "movq %%mm0, (%0, %1) \n\t" 3167 "movq %%mm0, (%0, %1) \n\t"
3168 3168
3169 : "+r" (temp_src) 3169 : "+r" (temp_src)
3170 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) 3170 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3171 : "%"REG_a, "%"REG_c 3171 : "%"REG_a, "%"REG_c
3172 ); 3172 );
3173 } 3173 }
3174 /*if(step==16){ 3174 /*if(step==16){
3175 STOP_TIMER("step16") 3175 STOP_TIMER("step16")
3176 }else{ 3176 }else{
3177 STOP_TIMER("stepX") 3177 STOP_TIMER("stepX")
3178 }*/ 3178 }*/
3179 } 3179 }
3180 #endif //HAVE_MMX 3180 #endif //HAVE_MMX
3181 3181
// Forward declaration only; the definition of RENAME(postProcess) follows later in this file.
3182 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3182 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3183 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); 3183 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3184 3184
3185 /** 3185 /**
3186 * Copies a block from src to dst and fixes the black level 3186 * Copies a block from src to dst and fixes the black level
3187 * levelFix == 0 -> do not touch the brightness & contrast 3187 * levelFix == 0 -> do not touch the brightness & contrast
3188 */ 3188 */
3189 #undef SCALED_CPY 3189 #undef SCALED_CPY
3190 3190
/*
 * blockCopy: copies an 8-line block from src to dst.
 *
 * In the MMX paths each SCALED_CPY / SIMPLE_CPY step moves two lines of
 * 8 bytes (one movq per line) and four steps cover lines 0..7. In the plain
 * C paths each of the 8 lines is copied with memcpy(..., BLOCK_SIZE).
 *
 * dst, dstStride        start of the destination block and its line stride
 * src, srcStride        start of the source block and its line stride
 * levelFix              non-zero selects the offset/scale (level fixing) path,
 *                       zero selects the plain copy
 * packedOffsetAndScale  only read when levelFix is set and HAVE_MMX is defined:
 *                       the 8 bytes at offset 0 are used as the packed offset,
 *                       the 8 bytes at offset 8 as the packed scale
 *
 * NOTE(review): without HAVE_MMX the levelFix branch runs the same memcpy
 * loop as the plain branch, so no offset/scale correction is applied there.
 */
3191 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, 3191 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
3192 int levelFix, int64_t *packedOffsetAndScale) 3192 int levelFix, int64_t *packedOffsetAndScale)
3193 { 3193 {
3194 #ifndef HAVE_MMX 3194 #ifndef HAVE_MMX
3195 int i; 3195 int i;
3196 #endif 3196 #endif
3197 if(levelFix) 3197 if(levelFix)
3198 { 3198 {
3199 #ifdef HAVE_MMX 3199 #ifdef HAVE_MMX
3200 asm volatile( 3200 asm volatile(
// REG_a holds packedOffsetAndScale on entry (operand 0/1 below); after the two
// loads it is overwritten and reused as the "src + srcStride" line pointer,
// REG_d becomes "dst + dstStride". mm4 is cleared for the zero-extending unpack.
3201 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset 3201 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
3202 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale 3202 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
3203 "lea (%2,%4), %%"REG_a" \n\t" 3203 "lea (%2,%4), %%"REG_a" \n\t"
3204 "lea (%3,%5), %%"REG_d" \n\t" 3204 "lea (%3,%5), %%"REG_d" \n\t"
3205 "pxor %%mm4, %%mm4 \n\t" 3205 "pxor %%mm4, %%mm4 \n\t"
3206 #ifdef HAVE_MMX2 3206 #ifdef HAVE_MMX2
// MMX2 variant: each byte is interleaved with itself into a 16-bit word,
// multiplied by the scale keeping the high half (pmulhuw), the offset is
// subtracted, and the words are packed back to bytes with unsigned saturation.
3207 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3207 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3208 "movq " #src1 ", %%mm0 \n\t"\ 3208 "movq " #src1 ", %%mm0 \n\t"\
3209 "movq " #src1 ", %%mm5 \n\t"\ 3209 "movq " #src1 ", %%mm5 \n\t"\
3210 "movq " #src2 ", %%mm1 \n\t"\ 3210 "movq " #src2 ", %%mm1 \n\t"\
3211 "movq " #src2 ", %%mm6 \n\t"\ 3211 "movq " #src2 ", %%mm6 \n\t"\
3212 "punpcklbw %%mm0, %%mm0 \n\t"\ 3212 "punpcklbw %%mm0, %%mm0 \n\t"\
3213 "punpckhbw %%mm5, %%mm5 \n\t"\ 3213 "punpckhbw %%mm5, %%mm5 \n\t"\
3214 "punpcklbw %%mm1, %%mm1 \n\t"\ 3214 "punpcklbw %%mm1, %%mm1 \n\t"\
3215 "punpckhbw %%mm6, %%mm6 \n\t"\ 3215 "punpckhbw %%mm6, %%mm6 \n\t"\
3216 "pmulhuw %%mm3, %%mm0 \n\t"\ 3216 "pmulhuw %%mm3, %%mm0 \n\t"\
3217 "pmulhuw %%mm3, %%mm5 \n\t"\ 3217 "pmulhuw %%mm3, %%mm5 \n\t"\
3218 "pmulhuw %%mm3, %%mm1 \n\t"\ 3218 "pmulhuw %%mm3, %%mm1 \n\t"\
3219 "pmulhuw %%mm3, %%mm6 \n\t"\ 3219 "pmulhuw %%mm3, %%mm6 \n\t"\
3220 "psubw %%mm2, %%mm0 \n\t"\ 3220 "psubw %%mm2, %%mm0 \n\t"\
3221 "psubw %%mm2, %%mm5 \n\t"\ 3221 "psubw %%mm2, %%mm5 \n\t"\
3222 "psubw %%mm2, %%mm1 \n\t"\ 3222 "psubw %%mm2, %%mm1 \n\t"\
3223 "psubw %%mm2, %%mm6 \n\t"\ 3223 "psubw %%mm2, %%mm6 \n\t"\
3224 "packuswb %%mm5, %%mm0 \n\t"\ 3224 "packuswb %%mm5, %%mm0 \n\t"\
3225 "packuswb %%mm6, %%mm1 \n\t"\ 3225 "packuswb %%mm6, %%mm1 \n\t"\
3226 "movq %%mm0, " #dst1 " \n\t"\ 3226 "movq %%mm0, " #dst1 " \n\t"\
3227 "movq %%mm1, " #dst2 " \n\t"\ 3227 "movq %%mm1, " #dst2 " \n\t"\
3228 3228
3229 #else //HAVE_MMX2 3229 #else //HAVE_MMX2
// Plain MMX variant: bytes are zero-extended to words using mm4 (0), the offset
// is subtracted first, the result is shifted left by 6 and multiplied by the
// scale keeping the high half (pmulhw), then packed back with unsigned saturation.
3230 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3230 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3231 "movq " #src1 ", %%mm0 \n\t"\ 3231 "movq " #src1 ", %%mm0 \n\t"\
3232 "movq " #src1 ", %%mm5 \n\t"\ 3232 "movq " #src1 ", %%mm5 \n\t"\
3233 "punpcklbw %%mm4, %%mm0 \n\t"\ 3233 "punpcklbw %%mm4, %%mm0 \n\t"\
3234 "punpckhbw %%mm4, %%mm5 \n\t"\ 3234 "punpckhbw %%mm4, %%mm5 \n\t"\
3235 "psubw %%mm2, %%mm0 \n\t"\ 3235 "psubw %%mm2, %%mm0 \n\t"\
3236 "psubw %%mm2, %%mm5 \n\t"\ 3236 "psubw %%mm2, %%mm5 \n\t"\
3237 "movq " #src2 ", %%mm1 \n\t"\ 3237 "movq " #src2 ", %%mm1 \n\t"\
3238 "psllw $6, %%mm0 \n\t"\ 3238 "psllw $6, %%mm0 \n\t"\
3239 "psllw $6, %%mm5 \n\t"\ 3239 "psllw $6, %%mm5 \n\t"\
3240 "pmulhw %%mm3, %%mm0 \n\t"\ 3240 "pmulhw %%mm3, %%mm0 \n\t"\
3241 "movq " #src2 ", %%mm6 \n\t"\ 3241 "movq " #src2 ", %%mm6 \n\t"\
3242 "pmulhw %%mm3, %%mm5 \n\t"\ 3242 "pmulhw %%mm3, %%mm5 \n\t"\
3243 "punpcklbw %%mm4, %%mm1 \n\t"\ 3243 "punpcklbw %%mm4, %%mm1 \n\t"\
3244 "punpckhbw %%mm4, %%mm6 \n\t"\ 3244 "punpckhbw %%mm4, %%mm6 \n\t"\
3245 "psubw %%mm2, %%mm1 \n\t"\ 3245 "psubw %%mm2, %%mm1 \n\t"\
3246 "psubw %%mm2, %%mm6 \n\t"\ 3246 "psubw %%mm2, %%mm6 \n\t"\
3247 "psllw $6, %%mm1 \n\t"\ 3247 "psllw $6, %%mm1 \n\t"\
3248 "psllw $6, %%mm6 \n\t"\ 3248 "psllw $6, %%mm6 \n\t"\
3249 "pmulhw %%mm3, %%mm1 \n\t"\ 3249 "pmulhw %%mm3, %%mm1 \n\t"\
3250 "pmulhw %%mm3, %%mm6 \n\t"\ 3250 "pmulhw %%mm3, %%mm6 \n\t"\
3251 "packuswb %%mm5, %%mm0 \n\t"\ 3251 "packuswb %%mm5, %%mm0 \n\t"\
3252 "packuswb %%mm6, %%mm1 \n\t"\ 3252 "packuswb %%mm6, %%mm1 \n\t"\
3253 "movq %%mm0, " #dst1 " \n\t"\ 3253 "movq %%mm0, " #dst1 " \n\t"\
3254 "movq %%mm1, " #dst2 " \n\t"\ 3254 "movq %%mm1, " #dst2 " \n\t"\
3255 3255
3256 #endif //HAVE_MMX2 3256 #endif //HAVE_MMX2
3257 #define SCALED_CPY(src1, src2, dst1, dst2)\ 3257 #define SCALED_CPY(src1, src2, dst1, dst2)\
3258 REAL_SCALED_CPY(src1, src2, dst1, dst2) 3258 REAL_SCALED_CPY(src1, src2, dst1, dst2)
3259 3259
// NOTE(review): the SCALED_CPY -> REAL_SCALED_CPY indirection is presumably there so
// that REGa/REGd (defined outside this chunk) are macro-expanded before being stringified.
// Lines 0/1, 2/3 and 4/5 are addressed from the block start and from REG_a/REG_d
// (block start + 1 line); both are then advanced by 4 lines to reach lines 6/7.
3260 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) 3260 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3261 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) 3261 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
3262 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) 3262 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
3263 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" 3263 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
3264 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" 3264 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
3265 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) 3265 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3266 3266
3267 3267
// Operand 0 is an early-clobber output tied to input 1 ("0"): the pointer is passed
// in the "a" register and that register is declared as modified by the asm.
3268 : "=&a" (packedOffsetAndScale) 3268 : "=&a" (packedOffsetAndScale)
3269 : "0" (packedOffsetAndScale), 3269 : "0" (packedOffsetAndScale),
3270 "r"(src), 3270 "r"(src),
3271 "r"(dst), 3271 "r"(dst),
3272 "r" ((long)srcStride), 3272 "r" ((long)srcStride),
3273 "r" ((long)dstStride) 3273 "r" ((long)dstStride)
3274 : "%"REG_d 3274 : "%"REG_d
3275 ); 3275 );
3276 #else //HAVE_MMX 3276 #else //HAVE_MMX
// C fallback: straight line-by-line copy; levelFix has no effect on the pixel values here.
3277 for(i=0; i<8; i++) 3277 for(i=0; i<8; i++)
3278 memcpy( &(dst[dstStride*i]), 3278 memcpy( &(dst[dstStride*i]),
3279 &(src[srcStride*i]), BLOCK_SIZE); 3279 &(src[srcStride*i]), BLOCK_SIZE);
3280 #endif //HAVE_MMX 3280 #endif //HAVE_MMX
3281 } 3281 }
3282 else 3282 else
3283 { 3283 {
3284 #ifdef HAVE_MMX 3284 #ifdef HAVE_MMX
3285 asm volatile( 3285 asm volatile(
// Unscaled copy: REG_a = src + srcStride, REG_d = dst + dstStride, then the same
// two-lines-per-step addressing scheme as the scaled path above.
3286 "lea (%0,%2), %%"REG_a" \n\t" 3286 "lea (%0,%2), %%"REG_a" \n\t"
3287 "lea (%1,%3), %%"REG_d" \n\t" 3287 "lea (%1,%3), %%"REG_d" \n\t"
3288 3288
3289 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ 3289 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3290 "movq " #src1 ", %%mm0 \n\t"\ 3290 "movq " #src1 ", %%mm0 \n\t"\
3291 "movq " #src2 ", %%mm1 \n\t"\ 3291 "movq " #src2 ", %%mm1 \n\t"\
3292 "movq %%mm0, " #dst1 " \n\t"\ 3292 "movq %%mm0, " #dst1 " \n\t"\
3293 "movq %%mm1, " #dst2 " \n\t"\ 3293 "movq %%mm1, " #dst2 " \n\t"\
3294 3294
3295 #define SIMPLE_CPY(src1, src2, dst1, dst2)\ 3295 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3296 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) 3296 REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3297 3297
3298 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 3298 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3299 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) 3299 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
3300 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) 3300 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
3301 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" 3301 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
3302 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" 3302 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
3303 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) 3303 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3304 3304
3305 : : "r" (src), 3305 : : "r" (src),
3306 "r" (dst), 3306 "r" (dst),
3307 "r" ((long)srcStride), 3307 "r" ((long)srcStride),
3308 "r" ((long)dstStride) 3308 "r" ((long)dstStride)
3309 : "%"REG_a, "%"REG_d 3309 : "%"REG_a, "%"REG_d
3310 ); 3310 );
3311 #else //HAVE_MMX 3311 #else //HAVE_MMX
3312 for(i=0; i<8; i++) 3312 for(i=0; i<8; i++)
3313 memcpy( &(dst[dstStride*i]), 3313 memcpy( &(dst[dstStride*i]),
3314 &(src[srcStride*i]), BLOCK_SIZE); 3314 &(src[srcStride*i]), BLOCK_SIZE);
3315 #endif //HAVE_MMX 3315 #endif //HAVE_MMX
3316 } 3316 }
3317 } 3317 }
3318 3318
3319 /** 3319 /**
3320 * Duplicates the given 8 src pixels 3 times upward 3320 * Duplicates the given 8 src pixels 3 times upward
3321 */ 3321 */
/*
 * duplicate: copies the 8 bytes at src into the three lines directly above it,
 * i.e. to src - stride, src - 2*stride and src - 3*stride.
 *
 * src     address of the 8 bytes to replicate; the bytes at src itself are only read
 * stride  line stride; the copies are written at negative multiples of it
 */
3322 static inline void RENAME(duplicate)(uint8_t src[], int stride) 3322 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3323 { 3323 {
3324 #ifdef HAVE_MMX 3324 #ifdef HAVE_MMX
3325 asm volatile( 3325 asm volatile(
// %1 is -stride: after the add, %0 points one line above the original src, and the
// three stores hit %0, %0 + %1 and %0 + 2*%1. "+r" only changes the local copy of src.
3326 "movq (%0), %%mm0 \n\t" 3326 "movq (%0), %%mm0 \n\t"
3327 "add %1, %0 \n\t" 3327 "add %1, %0 \n\t"
3328 "movq %%mm0, (%0) \n\t" 3328 "movq %%mm0, (%0) \n\t"
3329 "movq %%mm0, (%0, %1) \n\t" 3329 "movq %%mm0, (%0, %1) \n\t"
3330 "movq %%mm0, (%0, %1, 2) \n\t" 3330 "movq %%mm0, (%0, %1, 2) \n\t"
3331 : "+r" (src) 3331 : "+r" (src)
3332 : "r" ((long)-stride) 3332 : "r" ((long)-stride)
3333 ); 3333 );
3334 #else 3334 #else
// C fallback: walk p upward one line at a time and copy the same 8 source bytes each time.
3335 int i; 3335 int i;
3336 uint8_t *p=src; 3336 uint8_t *p=src;
3337 for(i=0; i<3; i++) 3337 for(i=0; i<3; i++)
3338 { 3338 {
3339 p-= stride; 3339 p-= stride;
3340 memcpy(p, src, 8); 3340 memcpy(p, src, 8);
3341 } 3341 }
3342 #endif 3342 #endif
3343 } 3343 }
3344 3344
3345 /** 3345 /**
3346 * Filters array of bytes (Y or U or V values) 3346 * Filters array of bytes (Y or U or V values)
3347 */ 3347 */
3348 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3348 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3349 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) 3349 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3350 { 3350 {
3351 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access 3351 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
3352 int x,y; 3352 int x,y;
3353 #ifdef COMPILE_TIME_MODE 3353 #ifdef COMPILE_TIME_MODE
3354 const int mode= COMPILE_TIME_MODE; 3354 const int mode= COMPILE_TIME_MODE;
3355 #else 3355 #else
3356 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; 3356 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3357 #endif 3357 #endif
3358 int black=0, white=255; // blackest black and whitest white in the picture 3358 int black=0, white=255; // blackest black and whitest white in the picture
3359 int QPCorrecture= 256*256; 3359 int QPCorrecture= 256*256;
3360 3360
3361 int copyAhead; 3361 int copyAhead;
3362 #ifdef HAVE_MMX 3362 #ifdef HAVE_MMX
3363 int i; 3363 int i;
3364 #endif 3364 #endif
3365 3365
3366 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; 3366 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3367 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; 3367 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3368 3368
3369 //FIXME remove 3369 //FIXME remove
3370 uint64_t * const yHistogram= c.yHistogram; 3370 uint64_t * const yHistogram= c.yHistogram;
3371 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; 3371 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3372 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; 3372 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3373 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; 3373 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3374 3374
3375 #ifdef HAVE_MMX 3375 #ifdef HAVE_MMX
3376 for(i=0; i<57; i++){ 3376 for(i=0; i<57; i++){
3377 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; 3377 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3378 int threshold= offset*2 + 1; 3378 int threshold= offset*2 + 1;
3379 c.mmxDcOffset[i]= 0x7F - offset; 3379 c.mmxDcOffset[i]= 0x7F - offset;
3380 c.mmxDcThreshold[i]= 0x7F - threshold; 3380 c.mmxDcThreshold[i]= 0x7F - threshold;
3381 c.mmxDcOffset[i]*= 0x0101010101010101LL; 3381 c.mmxDcOffset[i]*= 0x0101010101010101LL;
3382 c.mmxDcThreshold[i]*= 0x0101010101010101LL; 3382 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3383 } 3383 }
3384 #endif 3384 #endif
3385 3385
3386 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; 3386 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3387 else if( (mode & LINEAR_BLEND_DEINT_FILTER) 3387 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3388 || (mode & FFMPEG_DEINT_FILTER) 3388 || (mode & FFMPEG_DEINT_FILTER)
3389 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; 3389 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3390 else if( (mode & V_DEBLOCK) 3390 else if( (mode & V_DEBLOCK)
3391 || (mode & LINEAR_IPOL_DEINT_FILTER) 3391 || (mode & LINEAR_IPOL_DEINT_FILTER)
3392 || (mode & MEDIAN_DEINT_FILTER) 3392 || (mode & MEDIAN_DEINT_FILTER)
3393 || (mode & V_A_DEBLOCK)) copyAhead=13; 3393 || (mode & V_A_DEBLOCK)) copyAhead=13;
3394 else if(mode & V_X1_FILTER) copyAhead=11; 3394 else if(mode & V_X1_FILTER) copyAhead=11;
3395 // else if(mode & V_RK1_FILTER) copyAhead=10; 3395 // else if(mode & V_RK1_FILTER) copyAhead=10;
3396 else if(mode & DERING) copyAhead=9; 3396 else if(mode & DERING) copyAhead=9;
3397 else copyAhead=8; 3397 else copyAhead=8;
3398 3398
3399 copyAhead-= 8; 3399 copyAhead-= 8;
3400 3400
3401 if(!isColor) 3401 if(!isColor)
3402 { 3402 {
3403 uint64_t sum= 0; 3403 uint64_t sum= 0;
3404 int i; 3404 int i;
3405 uint64_t maxClipped; 3405 uint64_t maxClipped;
3406 uint64_t clipped; 3406 uint64_t clipped;
3407 double scale; 3407 double scale;
3408 3408
3409 c.frameNum++; 3409 c.frameNum++;
3410 // first frame is fscked so we ignore it 3410 // first frame is fscked so we ignore it
3411 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; 3411 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3412 3412
3413 for(i=0; i<256; i++) 3413 for(i=0; i<256; i++)
3414 { 3414 {
3415 sum+= yHistogram[i]; 3415 sum+= yHistogram[i];
3416 // printf("%d ", yHistogram[i]); 3416 // printf("%d ", yHistogram[i]);
3417 } 3417 }
3418 // printf("\n\n"); 3418 // printf("\n\n");
3419 3419
3420 /* we allways get a completly black picture first */ 3420 /* we allways get a completly black picture first */
3421 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); 3421 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3422 3422
3423 clipped= sum; 3423 clipped= sum;
3424 for(black=255; black>0; black--) 3424 for(black=255; black>0; black--)
3425 { 3425 {
3426 if(clipped < maxClipped) break; 3426 if(clipped < maxClipped) break;
3427 clipped-= yHistogram[black]; 3427 clipped-= yHistogram[black];
3428 } 3428 }
3429 3429
3430 clipped= sum; 3430 clipped= sum;
3431 for(white=0; white<256; white++) 3431 for(white=0; white<256; white++)
3432 { 3432 {
3433 if(clipped < maxClipped) break; 3433 if(clipped < maxClipped) break;
3434 clipped-= yHistogram[white]; 3434 clipped-= yHistogram[white];
3435 } 3435 }
3436 3436
3437 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); 3437 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3438 3438
3439 #ifdef HAVE_MMX2 3439 #ifdef HAVE_MMX2
3440 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); 3440 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3441 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; 3441 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3442 #else 3442 #else
3443 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); 3443 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3444 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; 3444 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3445 #endif 3445 #endif
3446 3446
3447 c.packedYOffset|= c.packedYOffset<<32; 3447 c.packedYOffset|= c.packedYOffset<<32;
3448 c.packedYOffset|= c.packedYOffset<<16; 3448 c.packedYOffset|= c.packedYOffset<<16;
3449 3449
3450 c.packedYScale|= c.packedYScale<<32; 3450 c.packedYScale|= c.packedYScale<<32;
3451 c.packedYScale|= c.packedYScale<<16; 3451 c.packedYScale|= c.packedYScale<<16;
3452 3452
3453 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); 3453 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
3454 else QPCorrecture= 256*256; 3454 else QPCorrecture= 256*256;
3455 } 3455 }
3456 else 3456 else
3457 { 3457 {
3458 c.packedYScale= 0x0100010001000100LL; 3458 c.packedYScale= 0x0100010001000100LL;
3459 c.packedYOffset= 0; 3459 c.packedYOffset= 0;
3460 QPCorrecture= 256*256; 3460 QPCorrecture= 256*256;
3461 } 3461 }
3462 3462
3463 /* copy & deinterlace first row of blocks */ 3463 /* copy & deinterlace first row of blocks */
3464 y=-BLOCK_SIZE; 3464 y=-BLOCK_SIZE;
3465 { 3465 {
3466 uint8_t *srcBlock= &(src[y*srcStride]); 3466 uint8_t *srcBlock= &(src[y*srcStride]);
3467 uint8_t *dstBlock= tempDst + dstStride; 3467 uint8_t *dstBlock= tempDst + dstStride;
3468 3468
3469 // From this point on it is guranteed that we can read and write 16 lines downward 3469 // From this point on it is guranteed that we can read and write 16 lines downward
3470 // finish 1 block before the next otherwise we might have a problem 3470 // finish 1 block before the next otherwise we might have a problem
3471 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 3471 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3472 for(x=0; x<width; x+=BLOCK_SIZE) 3472 for(x=0; x<width; x+=BLOCK_SIZE)
3473 { 3473 {
3474 3474
3475 #ifdef HAVE_MMX2 3475 #ifdef HAVE_MMX2
3476 /* 3476 /*
3477 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3477 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3478 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3478 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3479 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3479 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3480 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3480 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3481 */ 3481 */
3482 3482
3483 asm( 3483 asm(
3484 "mov %4, %%"REG_a" \n\t" 3484 "mov %4, %%"REG_a" \n\t"
3485 "shr $2, %%"REG_a" \n\t" 3485 "shr $2, %%"REG_a" \n\t"
3486 "and $6, %%"REG_a" \n\t" 3486 "and $6, %%"REG_a" \n\t"
3487 "add %5, %%"REG_a" \n\t" 3487 "add %5, %%"REG_a" \n\t"
3488 "mov %%"REG_a", %%"REG_d" \n\t" 3488 "mov %%"REG_a", %%"REG_d" \n\t"
3489 "imul %1, %%"REG_a" \n\t" 3489 "imul %1, %%"REG_a" \n\t"
3490 "imul %3, %%"REG_d" \n\t" 3490 "imul %3, %%"REG_d" \n\t"
3491 "prefetchnta 32(%%"REG_a", %0) \n\t" 3491 "prefetchnta 32(%%"REG_a", %0) \n\t"
3492 "prefetcht0 32(%%"REG_d", %2) \n\t" 3492 "prefetcht0 32(%%"REG_d", %2) \n\t"
3493 "add %1, %%"REG_a" \n\t" 3493 "add %1, %%"REG_a" \n\t"
3494 "add %3, %%"REG_d" \n\t" 3494 "add %3, %%"REG_d" \n\t"
3495 "prefetchnta 32(%%"REG_a", %0) \n\t" 3495 "prefetchnta 32(%%"REG_a", %0) \n\t"
3496 "prefetcht0 32(%%"REG_d", %2) \n\t" 3496 "prefetcht0 32(%%"REG_d", %2) \n\t"
3497 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), 3497 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3498 "g" ((long)x), "g" ((long)copyAhead) 3498 "g" ((long)x), "g" ((long)copyAhead)
3499 : "%"REG_a, "%"REG_d 3499 : "%"REG_a, "%"REG_d
3500 ); 3500 );
3501 3501
3502 #elif defined(HAVE_3DNOW) 3502 #elif defined(HAVE_3DNOW)
3503 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... 3503 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
3504 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3504 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3505 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3505 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3506 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3506 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3507 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3507 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3508 */ 3508 */
3509 #endif 3509 #endif
3510 3510
3511 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, 3511 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3512 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3512 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3513 3513
3514 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); 3514 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3515 3515
3516 if(mode & LINEAR_IPOL_DEINT_FILTER) 3516 if(mode & LINEAR_IPOL_DEINT_FILTER)
3517 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3517 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3518 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3518 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3519 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3519 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3520 else if(mode & MEDIAN_DEINT_FILTER) 3520 else if(mode & MEDIAN_DEINT_FILTER)
3521 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3521 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3522 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3522 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3523 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3523 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3524 else if(mode & FFMPEG_DEINT_FILTER) 3524 else if(mode & FFMPEG_DEINT_FILTER)
3525 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3525 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3526 else if(mode & LOWPASS5_DEINT_FILTER) 3526 else if(mode & LOWPASS5_DEINT_FILTER)
3527 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3527 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3528 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3528 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3529 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3529 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3530 */ 3530 */
3531 dstBlock+=8; 3531 dstBlock+=8;
3532 srcBlock+=8; 3532 srcBlock+=8;
3533 } 3533 }
3534 if(width==ABS(dstStride)) 3534 if(width==ABS(dstStride))
3535 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); 3535 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3536 else 3536 else
3537 { 3537 {
3538 int i; 3538 int i;
3539 for(i=0; i<copyAhead; i++) 3539 for(i=0; i<copyAhead; i++)
3540 { 3540 {
3541 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); 3541 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3542 } 3542 }
3543 } 3543 }
3544 } 3544 }
3545 3545
3546 //printf("\n"); 3546 //printf("\n");
3547 for(y=0; y<height; y+=BLOCK_SIZE) 3547 for(y=0; y<height; y+=BLOCK_SIZE)
3548 { 3548 {
3549 //1% speedup if these are here instead of the inner loop 3549 //1% speedup if these are here instead of the inner loop
3550 uint8_t *srcBlock= &(src[y*srcStride]); 3550 uint8_t *srcBlock= &(src[y*srcStride]);
3551 uint8_t *dstBlock= &(dst[y*dstStride]); 3551 uint8_t *dstBlock= &(dst[y*dstStride]);
3552 #ifdef HAVE_MMX 3552 #ifdef HAVE_MMX
3553 uint8_t *tempBlock1= c.tempBlocks; 3553 uint8_t *tempBlock1= c.tempBlocks;
3554 uint8_t *tempBlock2= c.tempBlocks + 8; 3554 uint8_t *tempBlock2= c.tempBlocks + 8;
3555 #endif 3555 #endif
3556 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; 3556 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3557 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)]; 3557 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)];
3558 int QP=0; 3558 int QP=0;
3559 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 3559 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3560 if not than use a temporary buffer */ 3560 if not than use a temporary buffer */
3561 if(y+15 >= height) 3561 if(y+15 >= height)
3562 { 3562 {
3563 int i; 3563 int i;
3564 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with 3564 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3565 blockcopy to dst later */ 3565 blockcopy to dst later */
3566 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, 3566 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3567 MAX(height-y-copyAhead, 0), srcStride); 3567 MAX(height-y-copyAhead, 0), srcStride);
3568 3568
3569 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ 3569 /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3570 for(i=MAX(height-y, 8); i<copyAhead+8; i++) 3570 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
3571 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride)); 3571 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride));
3572 3572
3573 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ 3573 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3574 linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride); 3574 linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride);
3575 3575
3576 /* duplicate last line of dst to fill the void upto line (copyAhead) */ 3576 /* duplicate last line of dst to fill the void upto line (copyAhead) */
3577 for(i=height-y+1; i<=copyAhead; i++) 3577 for(i=height-y+1; i<=copyAhead; i++)
3578 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride)); 3578 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride));
3579 3579
3580 dstBlock= tempDst + dstStride; 3580 dstBlock= tempDst + dstStride;
3581 srcBlock= tempSrc; 3581 srcBlock= tempSrc;
3582 } 3582 }
3583 //printf("\n"); 3583 //printf("\n");
3584 3584
3585 // From this point on it is guranteed that we can read and write 16 lines downward 3585 // From this point on it is guranteed that we can read and write 16 lines downward
3586 // finish 1 block before the next otherwise we might have a problem 3586 // finish 1 block before the next otherwise we might have a problem
3587 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 3587 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3588 for(x=0; x<width; x+=BLOCK_SIZE) 3588 for(x=0; x<width; x+=BLOCK_SIZE)
3589 { 3589 {
3590 const int stride= dstStride; 3590 const int stride= dstStride;
3591 #ifdef HAVE_MMX 3591 #ifdef HAVE_MMX
3592 uint8_t *tmpXchg; 3592 uint8_t *tmpXchg;
3593 #endif 3593 #endif
3594 if(isColor) 3594 if(isColor)
3595 { 3595 {
3596 QP= QPptr[x>>qpHShift]; 3596 QP= QPptr[x>>qpHShift];
3597 c.nonBQP= nonBQPptr[x>>qpHShift]; 3597 c.nonBQP= nonBQPptr[x>>qpHShift];
3598 } 3598 }
3599 else 3599 else
3600 { 3600 {
3601 QP= QPptr[x>>4]; 3601 QP= QPptr[x>>4];
3602 QP= (QP* QPCorrecture + 256*128)>>16; 3602 QP= (QP* QPCorrecture + 256*128)>>16;
3603 c.nonBQP= nonBQPptr[x>>4]; 3603 c.nonBQP= nonBQPptr[x>>4];
3604 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; 3604 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3605 yHistogram[ srcBlock[srcStride*12 + 4] ]++; 3605 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3606 } 3606 }
3607 c.QP= QP; 3607 c.QP= QP;
3608 #ifdef HAVE_MMX 3608 #ifdef HAVE_MMX
3609 asm volatile( 3609 asm volatile(
3610 "movd %1, %%mm7 \n\t" 3610 "movd %1, %%mm7 \n\t"
3611 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 3611 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3612 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 3612 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3613 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 3613 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3614 "movq %%mm7, %0 \n\t" 3614 "movq %%mm7, %0 \n\t"
3615 : "=m" (c.pQPb) 3615 : "=m" (c.pQPb)
3616 : "r" (QP) 3616 : "r" (QP)
3617 ); 3617 );
3618 #endif 3618 #endif
3619 3619
3620 3620
3621 #ifdef HAVE_MMX2 3621 #ifdef HAVE_MMX2
3622 /* 3622 /*
3623 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3623 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3624 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3624 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3625 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3625 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3626 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3626 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3627 */ 3627 */
3628 3628
3629 asm( 3629 asm(
3630 "mov %4, %%"REG_a" \n\t" 3630 "mov %4, %%"REG_a" \n\t"
3631 "shr $2, %%"REG_a" \n\t" 3631 "shr $2, %%"REG_a" \n\t"
3632 "and $6, %%"REG_a" \n\t" 3632 "and $6, %%"REG_a" \n\t"
3633 "add %5, %%"REG_a" \n\t" 3633 "add %5, %%"REG_a" \n\t"
3634 "mov %%"REG_a", %%"REG_d" \n\t" 3634 "mov %%"REG_a", %%"REG_d" \n\t"
3635 "imul %1, %%"REG_a" \n\t" 3635 "imul %1, %%"REG_a" \n\t"
3636 "imul %3, %%"REG_d" \n\t" 3636 "imul %3, %%"REG_d" \n\t"
3637 "prefetchnta 32(%%"REG_a", %0) \n\t" 3637 "prefetchnta 32(%%"REG_a", %0) \n\t"
3638 "prefetcht0 32(%%"REG_d", %2) \n\t" 3638 "prefetcht0 32(%%"REG_d", %2) \n\t"
3639 "add %1, %%"REG_a" \n\t" 3639 "add %1, %%"REG_a" \n\t"
3640 "add %3, %%"REG_d" \n\t" 3640 "add %3, %%"REG_d" \n\t"
3641 "prefetchnta 32(%%"REG_a", %0) \n\t" 3641 "prefetchnta 32(%%"REG_a", %0) \n\t"
3642 "prefetcht0 32(%%"REG_d", %2) \n\t" 3642 "prefetcht0 32(%%"REG_d", %2) \n\t"
3643 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), 3643 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3644 "g" ((long)x), "g" ((long)copyAhead) 3644 "g" ((long)x), "g" ((long)copyAhead)
3645 : "%"REG_a, "%"REG_d 3645 : "%"REG_a, "%"REG_d
3646 ); 3646 );
3647 3647
3648 #elif defined(HAVE_3DNOW) 3648 #elif defined(HAVE_3DNOW)
3649 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... 3649 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
3650 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3650 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3651 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3651 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3652 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3652 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3653 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3653 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3654 */ 3654 */
3655 #endif 3655 #endif
3656 3656
3657 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, 3657 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3658 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3658 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3659 3659
3660 if(mode & LINEAR_IPOL_DEINT_FILTER) 3660 if(mode & LINEAR_IPOL_DEINT_FILTER)
3661 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3661 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3662 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3662 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3663 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3663 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3664 else if(mode & MEDIAN_DEINT_FILTER) 3664 else if(mode & MEDIAN_DEINT_FILTER)
3665 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3665 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3666 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3666 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3667 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3667 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3668 else if(mode & FFMPEG_DEINT_FILTER) 3668 else if(mode & FFMPEG_DEINT_FILTER)
3669 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3669 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3670 else if(mode & LOWPASS5_DEINT_FILTER) 3670 else if(mode & LOWPASS5_DEINT_FILTER)
3671 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3671 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3672 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3672 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3673 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3673 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3674 */ 3674 */
3675 3675
3676 /* only deblock if we have 2 blocks */ 3676 /* only deblock if we have 2 blocks */
3677 if(y + 8 < height) 3677 if(y + 8 < height)
3678 { 3678 {
3679 if(mode & V_X1_FILTER) 3679 if(mode & V_X1_FILTER)
3680 RENAME(vertX1Filter)(dstBlock, stride, &c); 3680 RENAME(vertX1Filter)(dstBlock, stride, &c);
3681 else if(mode & V_DEBLOCK) 3681 else if(mode & V_DEBLOCK)
3682 { 3682 {
3683 const int t= RENAME(vertClassify)(dstBlock, stride, &c); 3683 const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3684 3684
3685 if(t==1) 3685 if(t==1)
3686 RENAME(doVertLowPass)(dstBlock, stride, &c); 3686 RENAME(doVertLowPass)(dstBlock, stride, &c);
3687 else if(t==2) 3687 else if(t==2)
3688 RENAME(doVertDefFilter)(dstBlock, stride, &c); 3688 RENAME(doVertDefFilter)(dstBlock, stride, &c);
3689 }else if(mode & V_A_DEBLOCK){ 3689 }else if(mode & V_A_DEBLOCK){
3690 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); 3690 RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3691 } 3691 }
3692 } 3692 }
3693 3693
3694 #ifdef HAVE_MMX 3694 #ifdef HAVE_MMX
3695 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 3695 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3696 #endif 3696 #endif
3697 /* check if we have a previous block to deblock it with dstBlock */ 3697 /* check if we have a previous block to deblock it with dstBlock */
3698 if(x - 8 >= 0) 3698 if(x - 8 >= 0)
3699 { 3699 {
3700 #ifdef HAVE_MMX 3700 #ifdef HAVE_MMX
3701 if(mode & H_X1_FILTER) 3701 if(mode & H_X1_FILTER)
3702 RENAME(vertX1Filter)(tempBlock1, 16, &c); 3702 RENAME(vertX1Filter)(tempBlock1, 16, &c);
3703 else if(mode & H_DEBLOCK) 3703 else if(mode & H_DEBLOCK)
3704 { 3704 {
3705 //START_TIMER 3705 //START_TIMER
3706 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); 3706 const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3707 //STOP_TIMER("dc & minmax") 3707 //STOP_TIMER("dc & minmax")
3708 if(t==1) 3708 if(t==1)
3709 RENAME(doVertLowPass)(tempBlock1, 16, &c); 3709 RENAME(doVertLowPass)(tempBlock1, 16, &c);
3710 else if(t==2) 3710 else if(t==2)
3711 RENAME(doVertDefFilter)(tempBlock1, 16, &c); 3711 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3712 }else if(mode & H_A_DEBLOCK){ 3712 }else if(mode & H_A_DEBLOCK){
3713 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); 3713 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3714 } 3714 }
3715 3715
3716 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); 3716 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3717 3717
3718 #else 3718 #else
3719 if(mode & H_X1_FILTER) 3719 if(mode & H_X1_FILTER)
3720 horizX1Filter(dstBlock-4, stride, QP); 3720 horizX1Filter(dstBlock-4, stride, QP);
3721 else if(mode & H_DEBLOCK) 3721 else if(mode & H_DEBLOCK)
3722 { 3722 {
3723 #ifdef HAVE_ALTIVEC 3723 #ifdef HAVE_ALTIVEC
3724 unsigned char __attribute__ ((aligned(16))) tempBlock[272]; 3724 unsigned char __attribute__ ((aligned(16))) tempBlock[272];
3725 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); 3725 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3726 3726
3727 const int t=vertClassify_altivec(tempBlock-48, 16, &c); 3727 const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3728 if(t==1) { 3728 if(t==1) {
3729 doVertLowPass_altivec(tempBlock-48, 16, &c); 3729 doVertLowPass_altivec(tempBlock-48, 16, &c);
3730 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3730 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3731 } 3731 }
3732 else if(t==2) { 3732 else if(t==2) {
3733 doVertDefFilter_altivec(tempBlock-48, 16, &c); 3733 doVertDefFilter_altivec(tempBlock-48, 16, &c);
3734 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3734 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3735 } 3735 }
3736 #else 3736 #else
3737 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); 3737 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3738 3738
3739 if(t==1) 3739 if(t==1)
3740 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); 3740 RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3741 else if(t==2) 3741 else if(t==2)
3742 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); 3742 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3743 #endif 3743 #endif
3744 }else if(mode & H_A_DEBLOCK){ 3744 }else if(mode & H_A_DEBLOCK){
3745 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); 3745 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3746 } 3746 }
3747 #endif //HAVE_MMX 3747 #endif //HAVE_MMX
3748 if(mode & DERING) 3748 if(mode & DERING)
3749 { 3749 {
3750 //FIXME filter first line 3750 //FIXME filter first line
3751 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); 3751 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3752 } 3752 }
3753 3753
3754 if(mode & TEMP_NOISE_FILTER) 3754 if(mode & TEMP_NOISE_FILTER)
3755 { 3755 {
3756 RENAME(tempNoiseReducer)(dstBlock-8, stride, 3756 RENAME(tempNoiseReducer)(dstBlock-8, stride,
3757 c.tempBlured[isColor] + y*dstStride + x, 3757 c.tempBlured[isColor] + y*dstStride + x,
3758 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3758 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3759 c.ppMode.maxTmpNoise); 3759 c.ppMode.maxTmpNoise);
3760 } 3760 }
3761 } 3761 }
3762 3762
3763 dstBlock+=8; 3763 dstBlock+=8;
3764 srcBlock+=8; 3764 srcBlock+=8;
3765 3765
3766 #ifdef HAVE_MMX 3766 #ifdef HAVE_MMX
3767 tmpXchg= tempBlock1; 3767 tmpXchg= tempBlock1;
3768 tempBlock1= tempBlock2; 3768 tempBlock1= tempBlock2;
3769 tempBlock2 = tmpXchg; 3769 tempBlock2 = tmpXchg;
3770 #endif 3770 #endif
3771 } 3771 }
3772 3772
3773 if(mode & DERING) 3773 if(mode & DERING)
3774 { 3774 {
3775 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); 3775 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3776 } 3776 }
3777 3777
3778 if((mode & TEMP_NOISE_FILTER)) 3778 if((mode & TEMP_NOISE_FILTER))
3779 { 3779 {
3780 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, 3780 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3781 c.tempBlured[isColor] + y*dstStride + x, 3781 c.tempBlured[isColor] + y*dstStride + x,
3782 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3782 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3783 c.ppMode.maxTmpNoise); 3783 c.ppMode.maxTmpNoise);
3784 } 3784 }
3785 3785
3786 /* did we use a tmp buffer for the last lines*/ 3786 /* did we use a tmp buffer for the last lines*/
3787 if(y+15 >= height) 3787 if(y+15 >= height)
3788 { 3788 {
3789 uint8_t *dstBlock= &(dst[y*dstStride]); 3789 uint8_t *dstBlock= &(dst[y*dstStride]);
3790 if(width==ABS(dstStride)) 3790 if(width==ABS(dstStride))
3791 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); 3791 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3792 else 3792 else
3793 { 3793 {
3794 int i; 3794 int i;
3795 for(i=0; i<height-y; i++) 3795 for(i=0; i<height-y; i++)
3796 { 3796 {
3797 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); 3797 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3798 } 3798 }
3799 } 3799 }
3800 } 3800 }
3801 /* 3801 /*
3802 for(x=0; x<width; x+=32) 3802 for(x=0; x<width; x+=32)
3803 { 3803 {
3804 volatile int i; 3804 volatile int i;
3805 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] 3805 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3806 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] 3806 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3807 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; 3807 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3808 // + dstBlock[x +13*dstStride] 3808 // + dstBlock[x +13*dstStride]
3809 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; 3809 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3810 }*/ 3810 }*/
3811 } 3811 }
3812 #ifdef HAVE_3DNOW 3812 #ifdef HAVE_3DNOW
3813 asm volatile("femms"); 3813 asm volatile("femms");
3814 #elif defined (HAVE_MMX) 3814 #elif defined (HAVE_MMX)
3815 asm volatile("emms"); 3815 asm volatile("emms");
3816 #endif 3816 #endif
3817 3817
3818 #ifdef DEBUG_BRIGHTNESS 3818 #ifdef DEBUG_BRIGHTNESS
3819 if(!isColor) 3819 if(!isColor)
3820 { 3820 {
3821 int max=1; 3821 int max=1;
3822 int i; 3822 int i;
3823 for(i=0; i<256; i++) 3823 for(i=0; i<256; i++)
3824 if(yHistogram[i] > max) max=yHistogram[i]; 3824 if(yHistogram[i] > max) max=yHistogram[i];
3825 3825
3826 for(i=1; i<256; i++) 3826 for(i=1; i<256; i++)
3827 { 3827 {
3828 int x; 3828 int x;
3829 int start=yHistogram[i-1]/(max/256+1); 3829 int start=yHistogram[i-1]/(max/256+1);
3830 int end=yHistogram[i]/(max/256+1); 3830 int end=yHistogram[i]/(max/256+1);
3831 int inc= end > start ? 1 : -1; 3831 int inc= end > start ? 1 : -1;
3832 for(x=start; x!=end+inc; x+=inc) 3832 for(x=start; x!=end+inc; x+=inc)
3833 dst[ i*dstStride + x]+=128; 3833 dst[ i*dstStride + x]+=128;
3834 } 3834 }
3835 3835
3836 for(i=0; i<100; i+=2) 3836 for(i=0; i<100; i+=2)
3837 { 3837 {
3838 dst[ (white)*dstStride + i]+=128; 3838 dst[ (white)*dstStride + i]+=128;
3839 dst[ (black)*dstStride + i]+=128; 3839 dst[ (black)*dstStride + i]+=128;
3840 } 3840 }
3841 3841
3842 } 3842 }
3843 #endif 3843 #endif
3844 3844
3845 *c2= c; //copy local context back 3845 *c2= c; //copy local context back
3846 3846
3847 } 3847 }