comparison postprocess_template.c @ 95:c24dab9bca80 libpostproc

cosmetics: Fix indentation to be 4 spaces and consistently place {}.
author diego
date Sat, 22 Mar 2008 15:46:34 +0000
parents a8ed701a0873
children 404a656698ff
comparison
equal deleted inserted replaced
94:094413c45b0f 95:c24dab9bca80
40 40
41 #ifdef HAVE_MMX2 41 #ifdef HAVE_MMX2
42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" 42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
43 #elif defined (HAVE_MMX) 43 #elif defined (HAVE_MMX)
44 #define PMINUB(b,a,t) \ 44 #define PMINUB(b,a,t) \
45 "movq " #a ", " #t " \n\t"\ 45 "movq " #a ", " #t " \n\t"\
46 "psubusb " #b ", " #t " \n\t"\ 46 "psubusb " #b ", " #t " \n\t"\
47 "psubb " #t ", " #a " \n\t" 47 "psubb " #t ", " #a " \n\t"
48 #endif 48 #endif
49 49
50 #ifdef HAVE_MMX2 50 #ifdef HAVE_MMX2
51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" 51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
52 #elif defined (HAVE_MMX) 52 #elif defined (HAVE_MMX)
53 #define PMAXUB(a,b) \ 53 #define PMAXUB(a,b) \
54 "psubusb " #a ", " #b " \n\t"\ 54 "psubusb " #a ", " #b " \n\t"\
55 "paddb " #a ", " #b " \n\t" 55 "paddb " #a ", " #b " \n\t"
56 #endif 56 #endif
57 57
58 //FIXME? |255-0| = 1 (should not be a problem ...) 58 //FIXME? |255-0| = 1 (should not be a problem ...)
59 #ifdef HAVE_MMX 59 #ifdef HAVE_MMX
60 /** 60 /**
61 * Check if the middle 8x8 Block in the given 8x16 block is flat 61 * Check if the middle 8x8 Block in the given 8x16 block is flat
62 */ 62 */
63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
64 int numEq= 0, dcOk; 64 int numEq= 0, dcOk;
65 src+= stride*4; // src points to the beginning of the 8x8 block 65 src+= stride*4; // src points to the beginning of the 8x8 block
66 asm volatile( 66 asm volatile(
67 "movq %0, %%mm7 \n\t" 67 "movq %0, %%mm7 \n\t"
68 "movq %1, %%mm6 \n\t" 68 "movq %1, %%mm6 \n\t"
69 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 69 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
70 ); 70 );
71 71
72 asm volatile( 72 asm volatile(
73 "lea (%2, %3), %%"REG_a" \n\t" 73 "lea (%2, %3), %%"REG_a" \n\t"
74 // 0 1 2 3 4 5 6 7 8 9 74 // 0 1 2 3 4 5 6 7 8 9
75 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 75 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
76 76
77 "movq (%2), %%mm0 \n\t" 77 "movq (%2), %%mm0 \n\t"
78 "movq (%%"REG_a"), %%mm1 \n\t" 78 "movq (%%"REG_a"), %%mm1 \n\t"
79 "movq %%mm0, %%mm3 \n\t" 79 "movq %%mm0, %%mm3 \n\t"
80 "movq %%mm0, %%mm4 \n\t" 80 "movq %%mm0, %%mm4 \n\t"
81 PMAXUB(%%mm1, %%mm4) 81 PMAXUB(%%mm1, %%mm4)
82 PMINUB(%%mm1, %%mm3, %%mm5) 82 PMINUB(%%mm1, %%mm3, %%mm5)
83 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference 83 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
84 "paddb %%mm7, %%mm0 \n\t" 84 "paddb %%mm7, %%mm0 \n\t"
85 "pcmpgtb %%mm6, %%mm0 \n\t" 85 "pcmpgtb %%mm6, %%mm0 \n\t"
86 86
87 "movq (%%"REG_a",%3), %%mm2 \n\t" 87 "movq (%%"REG_a",%3), %%mm2 \n\t"
88 PMAXUB(%%mm2, %%mm4) 88 PMAXUB(%%mm2, %%mm4)
89 PMINUB(%%mm2, %%mm3, %%mm5) 89 PMINUB(%%mm2, %%mm3, %%mm5)
90 "psubb %%mm2, %%mm1 \n\t" 90 "psubb %%mm2, %%mm1 \n\t"
91 "paddb %%mm7, %%mm1 \n\t" 91 "paddb %%mm7, %%mm1 \n\t"
92 "pcmpgtb %%mm6, %%mm1 \n\t" 92 "pcmpgtb %%mm6, %%mm1 \n\t"
93 "paddb %%mm1, %%mm0 \n\t" 93 "paddb %%mm1, %%mm0 \n\t"
94 94
95 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 95 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
96 PMAXUB(%%mm1, %%mm4) 96 PMAXUB(%%mm1, %%mm4)
97 PMINUB(%%mm1, %%mm3, %%mm5) 97 PMINUB(%%mm1, %%mm3, %%mm5)
98 "psubb %%mm1, %%mm2 \n\t" 98 "psubb %%mm1, %%mm2 \n\t"
99 "paddb %%mm7, %%mm2 \n\t" 99 "paddb %%mm7, %%mm2 \n\t"
100 "pcmpgtb %%mm6, %%mm2 \n\t" 100 "pcmpgtb %%mm6, %%mm2 \n\t"
101 "paddb %%mm2, %%mm0 \n\t" 101 "paddb %%mm2, %%mm0 \n\t"
102 102
103 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 103 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
104 104
105 "movq (%2, %3, 4), %%mm2 \n\t" 105 "movq (%2, %3, 4), %%mm2 \n\t"
106 PMAXUB(%%mm2, %%mm4) 106 PMAXUB(%%mm2, %%mm4)
107 PMINUB(%%mm2, %%mm3, %%mm5) 107 PMINUB(%%mm2, %%mm3, %%mm5)
108 "psubb %%mm2, %%mm1 \n\t" 108 "psubb %%mm2, %%mm1 \n\t"
109 "paddb %%mm7, %%mm1 \n\t" 109 "paddb %%mm7, %%mm1 \n\t"
110 "pcmpgtb %%mm6, %%mm1 \n\t" 110 "pcmpgtb %%mm6, %%mm1 \n\t"
111 "paddb %%mm1, %%mm0 \n\t" 111 "paddb %%mm1, %%mm0 \n\t"
112 112
113 "movq (%%"REG_a"), %%mm1 \n\t" 113 "movq (%%"REG_a"), %%mm1 \n\t"
114 PMAXUB(%%mm1, %%mm4) 114 PMAXUB(%%mm1, %%mm4)
115 PMINUB(%%mm1, %%mm3, %%mm5) 115 PMINUB(%%mm1, %%mm3, %%mm5)
116 "psubb %%mm1, %%mm2 \n\t" 116 "psubb %%mm1, %%mm2 \n\t"
117 "paddb %%mm7, %%mm2 \n\t" 117 "paddb %%mm7, %%mm2 \n\t"
118 "pcmpgtb %%mm6, %%mm2 \n\t" 118 "pcmpgtb %%mm6, %%mm2 \n\t"
119 "paddb %%mm2, %%mm0 \n\t" 119 "paddb %%mm2, %%mm0 \n\t"
120 120
121 "movq (%%"REG_a", %3), %%mm2 \n\t" 121 "movq (%%"REG_a", %3), %%mm2 \n\t"
122 PMAXUB(%%mm2, %%mm4) 122 PMAXUB(%%mm2, %%mm4)
123 PMINUB(%%mm2, %%mm3, %%mm5) 123 PMINUB(%%mm2, %%mm3, %%mm5)
124 "psubb %%mm2, %%mm1 \n\t" 124 "psubb %%mm2, %%mm1 \n\t"
125 "paddb %%mm7, %%mm1 \n\t" 125 "paddb %%mm7, %%mm1 \n\t"
126 "pcmpgtb %%mm6, %%mm1 \n\t" 126 "pcmpgtb %%mm6, %%mm1 \n\t"
127 "paddb %%mm1, %%mm0 \n\t" 127 "paddb %%mm1, %%mm0 \n\t"
128 128
129 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 129 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
130 PMAXUB(%%mm1, %%mm4) 130 PMAXUB(%%mm1, %%mm4)
131 PMINUB(%%mm1, %%mm3, %%mm5) 131 PMINUB(%%mm1, %%mm3, %%mm5)
132 "psubb %%mm1, %%mm2 \n\t" 132 "psubb %%mm1, %%mm2 \n\t"
133 "paddb %%mm7, %%mm2 \n\t" 133 "paddb %%mm7, %%mm2 \n\t"
134 "pcmpgtb %%mm6, %%mm2 \n\t" 134 "pcmpgtb %%mm6, %%mm2 \n\t"
135 "paddb %%mm2, %%mm0 \n\t" 135 "paddb %%mm2, %%mm0 \n\t"
136 "psubusb %%mm3, %%mm4 \n\t" 136 "psubusb %%mm3, %%mm4 \n\t"
137 137
138 " \n\t" 138 " \n\t"
139 #ifdef HAVE_MMX2 139 #ifdef HAVE_MMX2
140 "pxor %%mm7, %%mm7 \n\t" 140 "pxor %%mm7, %%mm7 \n\t"
141 "psadbw %%mm7, %%mm0 \n\t" 141 "psadbw %%mm7, %%mm0 \n\t"
142 #else 142 #else
143 "movq %%mm0, %%mm1 \n\t" 143 "movq %%mm0, %%mm1 \n\t"
144 "psrlw $8, %%mm0 \n\t" 144 "psrlw $8, %%mm0 \n\t"
145 "paddb %%mm1, %%mm0 \n\t" 145 "paddb %%mm1, %%mm0 \n\t"
146 "movq %%mm0, %%mm1 \n\t" 146 "movq %%mm0, %%mm1 \n\t"
147 "psrlq $16, %%mm0 \n\t" 147 "psrlq $16, %%mm0 \n\t"
148 "paddb %%mm1, %%mm0 \n\t" 148 "paddb %%mm1, %%mm0 \n\t"
149 "movq %%mm0, %%mm1 \n\t" 149 "movq %%mm0, %%mm1 \n\t"
150 "psrlq $32, %%mm0 \n\t" 150 "psrlq $32, %%mm0 \n\t"
151 "paddb %%mm1, %%mm0 \n\t" 151 "paddb %%mm1, %%mm0 \n\t"
152 #endif 152 #endif
153 "movq %4, %%mm7 \n\t" // QP,..., QP 153 "movq %4, %%mm7 \n\t" // QP,..., QP
154 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 154 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
155 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 155 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
156 "packssdw %%mm4, %%mm4 \n\t" 156 "packssdw %%mm4, %%mm4 \n\t"
157 "movd %%mm0, %0 \n\t" 157 "movd %%mm0, %0 \n\t"
158 "movd %%mm4, %1 \n\t" 158 "movd %%mm4, %1 \n\t"
159 159
160 : "=r" (numEq), "=r" (dcOk) 160 : "=r" (numEq), "=r" (dcOk)
161 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 161 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
162 : "%"REG_a 162 : "%"REG_a
163 ); 163 );
164 164
165 numEq= (-numEq) &0xFF; 165 numEq= (-numEq) &0xFF;
166 if(numEq > c->ppMode.flatnessThreshold){ 166 if(numEq > c->ppMode.flatnessThreshold){
167 if(dcOk) return 0; 167 if(dcOk) return 0;
168 else return 1; 168 else return 1;
169 }else{ 169 }else{
170 return 2; 170 return 2;
171 } 171 }
172 } 172 }
173 #endif //HAVE_MMX 173 #endif //HAVE_MMX
174 174
175 /** 175 /**
176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
178 */ 178 */
179 #ifndef HAVE_ALTIVEC 179 #ifndef HAVE_ALTIVEC
180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
181 { 181 {
182 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 182 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
183 src+= stride*3; 183 src+= stride*3;
184 asm volatile( //"movv %0 %1 %2\n\t" 184 asm volatile( //"movv %0 %1 %2\n\t"
185 "movq %2, %%mm0 \n\t" // QP,..., QP 185 "movq %2, %%mm0 \n\t" // QP,..., QP
186 "pxor %%mm4, %%mm4 \n\t" 186 "pxor %%mm4, %%mm4 \n\t"
187 187
188 "movq (%0), %%mm6 \n\t" 188 "movq (%0), %%mm6 \n\t"
189 "movq (%0, %1), %%mm5 \n\t" 189 "movq (%0, %1), %%mm5 \n\t"
190 "movq %%mm5, %%mm1 \n\t" 190 "movq %%mm5, %%mm1 \n\t"
191 "movq %%mm6, %%mm2 \n\t" 191 "movq %%mm6, %%mm2 \n\t"
192 "psubusb %%mm6, %%mm5 \n\t" 192 "psubusb %%mm6, %%mm5 \n\t"
193 "psubusb %%mm1, %%mm2 \n\t" 193 "psubusb %%mm1, %%mm2 \n\t"
194 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 194 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
195 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 195 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
196 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 196 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
197 197
198 "pand %%mm2, %%mm6 \n\t" 198 "pand %%mm2, %%mm6 \n\t"
199 "pandn %%mm1, %%mm2 \n\t" 199 "pandn %%mm1, %%mm2 \n\t"
200 "por %%mm2, %%mm6 \n\t"// First Line to Filter 200 "por %%mm2, %%mm6 \n\t"// First Line to Filter
201 201
202 "movq (%0, %1, 8), %%mm5 \n\t" 202 "movq (%0, %1, 8), %%mm5 \n\t"
203 "lea (%0, %1, 4), %%"REG_a" \n\t" 203 "lea (%0, %1, 4), %%"REG_a" \n\t"
204 "lea (%0, %1, 8), %%"REG_c" \n\t" 204 "lea (%0, %1, 8), %%"REG_c" \n\t"
205 "sub %1, %%"REG_c" \n\t" 205 "sub %1, %%"REG_c" \n\t"
206 "add %1, %0 \n\t" // %0 points to line 1 not 0 206 "add %1, %0 \n\t" // %0 points to line 1 not 0
207 "movq (%0, %1, 8), %%mm7 \n\t" 207 "movq (%0, %1, 8), %%mm7 \n\t"
208 "movq %%mm5, %%mm1 \n\t" 208 "movq %%mm5, %%mm1 \n\t"
209 "movq %%mm7, %%mm2 \n\t" 209 "movq %%mm7, %%mm2 \n\t"
210 "psubusb %%mm7, %%mm5 \n\t" 210 "psubusb %%mm7, %%mm5 \n\t"
211 "psubusb %%mm1, %%mm2 \n\t" 211 "psubusb %%mm1, %%mm2 \n\t"
212 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 212 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
213 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 213 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
214 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 214 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
215 215
216 "pand %%mm2, %%mm7 \n\t" 216 "pand %%mm2, %%mm7 \n\t"
217 "pandn %%mm1, %%mm2 \n\t" 217 "pandn %%mm1, %%mm2 \n\t"
218 "por %%mm2, %%mm7 \n\t" // Last Line to Filter 218 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
219 219
220 220
221 // 1 2 3 4 5 6 7 8 221 // 1 2 3 4 5 6 7 8
222 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 222 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
223 // 6 4 2 2 1 1 223 // 6 4 2 2 1 1
224 // 6 4 4 2 224 // 6 4 4 2
225 // 6 8 2 225 // 6 8 2
226 226
227 "movq (%0, %1), %%mm0 \n\t" // 1 227 "movq (%0, %1), %%mm0 \n\t" // 1
228 "movq %%mm0, %%mm1 \n\t" // 1 228 "movq %%mm0, %%mm1 \n\t" // 1
229 PAVGB(%%mm6, %%mm0) //1 1 /2 229 PAVGB(%%mm6, %%mm0) //1 1 /2
230 PAVGB(%%mm6, %%mm0) //3 1 /4 230 PAVGB(%%mm6, %%mm0) //3 1 /4
231 231
232 "movq (%0, %1, 4), %%mm2 \n\t" // 1 232 "movq (%0, %1, 4), %%mm2 \n\t" // 1
233 "movq %%mm2, %%mm5 \n\t" // 1 233 "movq %%mm2, %%mm5 \n\t" // 1
234 PAVGB((%%REGa), %%mm2) // 11 /2 234 PAVGB((%%REGa), %%mm2) // 11 /2
235 PAVGB((%0, %1, 2), %%mm2) // 211 /4 235 PAVGB((%0, %1, 2), %%mm2) // 211 /4
236 "movq %%mm2, %%mm3 \n\t" // 211 /4 236 "movq %%mm2, %%mm3 \n\t" // 211 /4
237 "movq (%0), %%mm4 \n\t" // 1 237 "movq (%0), %%mm4 \n\t" // 1
238 PAVGB(%%mm4, %%mm3) // 4 211 /8 238 PAVGB(%%mm4, %%mm3) // 4 211 /8
239 PAVGB(%%mm0, %%mm3) //642211 /16 239 PAVGB(%%mm0, %%mm3) //642211 /16
240 "movq %%mm3, (%0) \n\t" // X 240 "movq %%mm3, (%0) \n\t" // X
241 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 241 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
242 "movq %%mm1, %%mm0 \n\t" // 1 242 "movq %%mm1, %%mm0 \n\t" // 1
243 PAVGB(%%mm6, %%mm0) //1 1 /2 243 PAVGB(%%mm6, %%mm0) //1 1 /2
244 "movq %%mm4, %%mm3 \n\t" // 1 244 "movq %%mm4, %%mm3 \n\t" // 1
245 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 245 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
246 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 246 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
247 PAVGB((%%REGa), %%mm5) // 211 /4 247 PAVGB((%%REGa), %%mm5) // 211 /4
248 PAVGB(%%mm5, %%mm3) // 2 2211 /8 248 PAVGB(%%mm5, %%mm3) // 2 2211 /8
249 PAVGB(%%mm0, %%mm3) //4242211 /16 249 PAVGB(%%mm0, %%mm3) //4242211 /16
250 "movq %%mm3, (%0,%1) \n\t" // X 250 "movq %%mm3, (%0,%1) \n\t" // X
251 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 251 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
252 PAVGB(%%mm4, %%mm6) //11 /2 252 PAVGB(%%mm4, %%mm6) //11 /2
253 "movq (%%"REG_c"), %%mm0 \n\t" // 1 253 "movq (%%"REG_c"), %%mm0 \n\t" // 1
254 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 254 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
255 "movq %%mm0, %%mm3 \n\t" // 11/2 255 "movq %%mm0, %%mm3 \n\t" // 11/2
256 PAVGB(%%mm1, %%mm0) // 2 11/4 256 PAVGB(%%mm1, %%mm0) // 2 11/4
257 PAVGB(%%mm6, %%mm0) //222 11/8 257 PAVGB(%%mm6, %%mm0) //222 11/8
258 PAVGB(%%mm2, %%mm0) //22242211/16 258 PAVGB(%%mm2, %%mm0) //22242211/16
259 "movq (%0, %1, 2), %%mm2 \n\t" // 1 259 "movq (%0, %1, 2), %%mm2 \n\t" // 1
260 "movq %%mm0, (%0, %1, 2) \n\t" // X 260 "movq %%mm0, (%0, %1, 2) \n\t" // X
261 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 261 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
262 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 262 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
263 PAVGB((%%REGc), %%mm0) // 11 /2 263 PAVGB((%%REGc), %%mm0) // 11 /2
264 PAVGB(%%mm0, %%mm6) //11 11 /4 264 PAVGB(%%mm0, %%mm6) //11 11 /4
265 PAVGB(%%mm1, %%mm4) // 11 /2 265 PAVGB(%%mm1, %%mm4) // 11 /2
266 PAVGB(%%mm2, %%mm1) // 11 /2 266 PAVGB(%%mm2, %%mm1) // 11 /2
267 PAVGB(%%mm1, %%mm6) //1122 11 /8 267 PAVGB(%%mm1, %%mm6) //1122 11 /8
268 PAVGB(%%mm5, %%mm6) //112242211 /16 268 PAVGB(%%mm5, %%mm6) //112242211 /16
269 "movq (%%"REG_a"), %%mm5 \n\t" // 1 269 "movq (%%"REG_a"), %%mm5 \n\t" // 1
270 "movq %%mm6, (%%"REG_a") \n\t" // X 270 "movq %%mm6, (%%"REG_a") \n\t" // X
271 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 271 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
272 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 272 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
273 PAVGB(%%mm7, %%mm6) // 11 /2 273 PAVGB(%%mm7, %%mm6) // 11 /2
274 PAVGB(%%mm4, %%mm6) // 11 11 /4 274 PAVGB(%%mm4, %%mm6) // 11 11 /4
275 PAVGB(%%mm3, %%mm6) // 11 2211 /8 275 PAVGB(%%mm3, %%mm6) // 11 2211 /8
276 PAVGB(%%mm5, %%mm2) // 11 /2 276 PAVGB(%%mm5, %%mm2) // 11 /2
277 "movq (%0, %1, 4), %%mm4 \n\t" // 1 277 "movq (%0, %1, 4), %%mm4 \n\t" // 1
278 PAVGB(%%mm4, %%mm2) // 112 /4 278 PAVGB(%%mm4, %%mm2) // 112 /4
279 PAVGB(%%mm2, %%mm6) // 112242211 /16 279 PAVGB(%%mm2, %%mm6) // 112242211 /16
280 "movq %%mm6, (%0, %1, 4) \n\t" // X 280 "movq %%mm6, (%0, %1, 4) \n\t" // X
281 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 281 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
282 PAVGB(%%mm7, %%mm1) // 11 2 /4 282 PAVGB(%%mm7, %%mm1) // 11 2 /4
283 PAVGB(%%mm4, %%mm5) // 11 /2 283 PAVGB(%%mm4, %%mm5) // 11 /2
284 PAVGB(%%mm5, %%mm0) // 11 11 /4 284 PAVGB(%%mm5, %%mm0) // 11 11 /4
285 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 285 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
286 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 286 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
287 PAVGB(%%mm0, %%mm1) // 11224222 /16 287 PAVGB(%%mm0, %%mm1) // 11224222 /16
288 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 288 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
289 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 289 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
290 PAVGB((%%REGc), %%mm2) // 112 4 /8 290 PAVGB((%%REGc), %%mm2) // 112 4 /8
291 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 291 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
292 PAVGB(%%mm0, %%mm6) // 1 1 /2 292 PAVGB(%%mm0, %%mm6) // 1 1 /2
293 PAVGB(%%mm7, %%mm6) // 1 12 /4 293 PAVGB(%%mm7, %%mm6) // 1 12 /4
294 PAVGB(%%mm2, %%mm6) // 1122424 /16 294 PAVGB(%%mm2, %%mm6) // 1122424 /16
295 "movq %%mm6, (%%"REG_c") \n\t" // X 295 "movq %%mm6, (%%"REG_c") \n\t" // X
296 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 296 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
297 PAVGB(%%mm7, %%mm5) // 11 2 /4 297 PAVGB(%%mm7, %%mm5) // 11 2 /4
298 PAVGB(%%mm7, %%mm5) // 11 6 /8 298 PAVGB(%%mm7, %%mm5) // 11 6 /8
299 299
300 PAVGB(%%mm3, %%mm0) // 112 /4 300 PAVGB(%%mm3, %%mm0) // 112 /4
301 PAVGB(%%mm0, %%mm5) // 112246 /16 301 PAVGB(%%mm0, %%mm5) // 112246 /16
302 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 302 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
303 "sub %1, %0 \n\t" 303 "sub %1, %0 \n\t"
304 304
305 : 305 :
306 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 306 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
307 : "%"REG_a, "%"REG_c 307 : "%"REG_a, "%"REG_c
308 ); 308 );
309 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 309 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
310 const int l1= stride; 310 const int l1= stride;
311 const int l2= stride + l1; 311 const int l2= stride + l1;
312 const int l3= stride + l2; 312 const int l3= stride + l2;
313 const int l4= stride + l3; 313 const int l4= stride + l3;
314 const int l5= stride + l4; 314 const int l5= stride + l4;
315 const int l6= stride + l5; 315 const int l6= stride + l5;
316 const int l7= stride + l6; 316 const int l7= stride + l6;
317 const int l8= stride + l7; 317 const int l8= stride + l7;
318 const int l9= stride + l8; 318 const int l9= stride + l8;
319 int x; 319 int x;
320 src+= stride*3; 320 src+= stride*3;
321 for(x=0; x<BLOCK_SIZE; x++) 321 for(x=0; x<BLOCK_SIZE; x++){
322 { 322 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
323 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; 323 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
324 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; 324
325 325 int sums[10];
326 int sums[10]; 326 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
327 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; 327 sums[1] = sums[0] - first + src[l4];
328 sums[1] = sums[0] - first + src[l4]; 328 sums[2] = sums[1] - first + src[l5];
329 sums[2] = sums[1] - first + src[l5]; 329 sums[3] = sums[2] - first + src[l6];
330 sums[3] = sums[2] - first + src[l6]; 330 sums[4] = sums[3] - first + src[l7];
331 sums[4] = sums[3] - first + src[l7]; 331 sums[5] = sums[4] - src[l1] + src[l8];
332 sums[5] = sums[4] - src[l1] + src[l8]; 332 sums[6] = sums[5] - src[l2] + last;
333 sums[6] = sums[5] - src[l2] + last; 333 sums[7] = sums[6] - src[l3] + last;
334 sums[7] = sums[6] - src[l3] + last; 334 sums[8] = sums[7] - src[l4] + last;
335 sums[8] = sums[7] - src[l4] + last; 335 sums[9] = sums[8] - src[l5] + last;
336 sums[9] = sums[8] - src[l5] + last; 336
337 337 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
338 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 338 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
339 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 339 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
340 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 340 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
341 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 341 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
342 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 342 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
343 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
344 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
345 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 345
346 346 src++;
347 src++; 347 }
348 }
349 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 348 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
350 } 349 }
351 #endif //HAVE_ALTIVEC 350 #endif //HAVE_ALTIVEC
352 351
353 #if 0 352 #if 0
364 1 12 12 23 363 1 12 12 23
365 */ 364 */
366 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) 365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
367 { 366 {
368 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 367 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
369 src+= stride*3; 368 src+= stride*3;
370 // FIXME rounding 369 // FIXME rounding
371 asm volatile( 370 asm volatile(
372 "pxor %%mm7, %%mm7 \n\t" // 0 371 "pxor %%mm7, %%mm7 \n\t" // 0
373 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE 372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
374 "leal (%0, %1), %%"REG_a" \n\t" 373 "leal (%0, %1), %%"REG_a" \n\t"
375 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" 374 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t"
376 // 0 1 2 3 4 5 6 7 8 9 375 // 0 1 2 3 4 5 6 7 8 9
377 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 376 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
378 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 377 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
379 "movq %%mm0, %%mm1 \n\t" // QP,..., QP 378 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
380 "paddusb "MANGLE(b02)", %%mm0 \n\t" 379 "paddusb "MANGLE(b02)", %%mm0 \n\t"
381 "psrlw $2, %%mm0 \n\t" 380 "psrlw $2, %%mm0 \n\t"
382 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 381 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
383 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... 382 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
384 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 383 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
385 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 384 "movq (%%"REG_c"), %%mm3 \n\t" // line 5
386 "movq %%mm2, %%mm4 \n\t" // line 4 385 "movq %%mm2, %%mm4 \n\t" // line 4
387 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 386 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
388 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 387 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
389 PAVGB(%%mm3, %%mm5) 388 PAVGB(%%mm3, %%mm5)
390 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 389 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
391 "psubusb %%mm3, %%mm4 \n\t" 390 "psubusb %%mm3, %%mm4 \n\t"
392 "psubusb %%mm2, %%mm3 \n\t" 391 "psubusb %%mm2, %%mm3 \n\t"
393 "por %%mm3, %%mm4 \n\t" // |l4 - l5| 392 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
394 "psubusb %%mm0, %%mm4 \n\t" 393 "psubusb %%mm0, %%mm4 \n\t"
395 "pcmpeqb %%mm7, %%mm4 \n\t" 394 "pcmpeqb %%mm7, %%mm4 \n\t"
396 "pand %%mm4, %%mm5 \n\t" // d/2 395 "pand %%mm4, %%mm5 \n\t" // d/2
397 396
398 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 397 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
399 "paddb %%mm5, %%mm2 \n\t" 398 "paddb %%mm5, %%mm2 \n\t"
400 // "psubb %%mm6, %%mm2 \n\t" 399 // "psubb %%mm6, %%mm2 \n\t"
401 "movq %%mm2, (%0,%1, 4) \n\t" 400 "movq %%mm2, (%0,%1, 4) \n\t"
402 401
403 "movq (%%"REG_c"), %%mm2 \n\t" 402 "movq (%%"REG_c"), %%mm2 \n\t"
404 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 403 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
405 "psubb %%mm5, %%mm2 \n\t" 404 "psubb %%mm5, %%mm2 \n\t"
406 // "psubb %%mm6, %%mm2 \n\t" 405 // "psubb %%mm6, %%mm2 \n\t"
407 "movq %%mm2, (%%"REG_c") \n\t" 406 "movq %%mm2, (%%"REG_c") \n\t"
408 407
409 "paddb %%mm6, %%mm5 \n\t" 408 "paddb %%mm6, %%mm5 \n\t"
410 "psrlw $2, %%mm5 \n\t" 409 "psrlw $2, %%mm5 \n\t"
411 "pand "MANGLE(b3F)", %%mm5 \n\t" 410 "pand "MANGLE(b3F)", %%mm5 \n\t"
412 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 411 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
413 412
414 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" 413 "movq (%%"REG_a", %1, 2), %%mm2 \n\t"
415 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 414 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
416 "paddsb %%mm5, %%mm2 \n\t" 415 "paddsb %%mm5, %%mm2 \n\t"
417 "psubb %%mm6, %%mm2 \n\t" 416 "psubb %%mm6, %%mm2 \n\t"
418 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 417 "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
419 418
420 "movq (%%"REG_c", %1), %%mm2 \n\t" 419 "movq (%%"REG_c", %1), %%mm2 \n\t"
421 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 420 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
422 "psubsb %%mm5, %%mm2 \n\t" 421 "psubsb %%mm5, %%mm2 \n\t"
423 "psubb %%mm6, %%mm2 \n\t" 422 "psubb %%mm6, %%mm2 \n\t"
424 "movq %%mm2, (%%"REG_c", %1) \n\t" 423 "movq %%mm2, (%%"REG_c", %1) \n\t"
425 424
426 : 425 :
427 : "r" (src), "r" ((long)stride) 426 : "r" (src), "r" ((long)stride)
428 : "%"REG_a, "%"REG_c 427 : "%"REG_a, "%"REG_c
429 ); 428 );
430 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 429 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
431 const int l1= stride; 430 const int l1= stride;
432 const int l2= stride + l1; 431 const int l2= stride + l1;
433 const int l3= stride + l2; 432 const int l3= stride + l2;
434 const int l4= stride + l3; 433 const int l4= stride + l3;
435 const int l5= stride + l4; 434 const int l5= stride + l4;
436 const int l6= stride + l5; 435 const int l6= stride + l5;
437 // const int l7= stride + l6; 436 // const int l7= stride + l6;
438 // const int l8= stride + l7; 437 // const int l8= stride + l7;
439 // const int l9= stride + l8; 438 // const int l9= stride + l8;
440 int x; 439 int x;
441 const int QP15= QP + (QP>>2); 440 const int QP15= QP + (QP>>2);
442 src+= stride*3; 441 src+= stride*3;
443 for(x=0; x<BLOCK_SIZE; x++) 442 for(x=0; x<BLOCK_SIZE; x++){
444 { 443 const int v = (src[x+l5] - src[x+l4]);
445 const int v = (src[x+l5] - src[x+l4]); 444 if(FFABS(v) < QP15){
446 if(FFABS(v) < QP15) 445 src[x+l3] +=v>>3;
447 { 446 src[x+l4] +=v>>1;
448 src[x+l3] +=v>>3; 447 src[x+l5] -=v>>1;
449 src[x+l4] +=v>>1; 448 src[x+l6] -=v>>3;
450 src[x+l5] -=v>>1;
451 src[x+l6] -=v>>3;
452
453 }
454 } 449 }
450 }
455 451
456 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 452 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
457 } 453 }
458 #endif //0 454 #endif //0
459 455
465 * MMX2 version does correct clipping; C version does not 461 * MMX2 version does correct clipping; C version does not
466 */ 462 */
467 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
468 { 464 {
469 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 465 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
470 src+= stride*3; 466 src+= stride*3;
471 467
472 asm volatile( 468 asm volatile(
473 "pxor %%mm7, %%mm7 \n\t" // 0 469 "pxor %%mm7, %%mm7 \n\t" // 0
474 "lea (%0, %1), %%"REG_a" \n\t" 470 "lea (%0, %1), %%"REG_a" \n\t"
475 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 471 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
476 // 0 1 2 3 4 5 6 7 8 9 472 // 0 1 2 3 4 5 6 7 8 9
477 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 473 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
478 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 474 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
479 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 475 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
480 "movq %%mm1, %%mm2 \n\t" // line 4 476 "movq %%mm1, %%mm2 \n\t" // line 4
481 "psubusb %%mm0, %%mm1 \n\t" 477 "psubusb %%mm0, %%mm1 \n\t"
482 "psubusb %%mm2, %%mm0 \n\t" 478 "psubusb %%mm2, %%mm0 \n\t"
483 "por %%mm1, %%mm0 \n\t" // |l3 - l4| 479 "por %%mm1, %%mm0 \n\t" // |l3 - l4|
484 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 480 "movq (%%"REG_c"), %%mm3 \n\t" // line 5
485 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 481 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
486 "movq %%mm3, %%mm5 \n\t" // line 5 482 "movq %%mm3, %%mm5 \n\t" // line 5
487 "psubusb %%mm4, %%mm3 \n\t" 483 "psubusb %%mm4, %%mm3 \n\t"
488 "psubusb %%mm5, %%mm4 \n\t" 484 "psubusb %%mm5, %%mm4 \n\t"
489 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 485 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
490 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2 486 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
491 "movq %%mm2, %%mm1 \n\t" // line 4 487 "movq %%mm2, %%mm1 \n\t" // line 4
492 "psubusb %%mm5, %%mm2 \n\t" 488 "psubusb %%mm5, %%mm2 \n\t"
493 "movq %%mm2, %%mm4 \n\t" 489 "movq %%mm2, %%mm4 \n\t"
494 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 490 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
495 "psubusb %%mm1, %%mm5 \n\t" 491 "psubusb %%mm1, %%mm5 \n\t"
496 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 492 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
497 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) 493 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
498 "movq %%mm4, %%mm3 \n\t" // d 494 "movq %%mm4, %%mm3 \n\t" // d
499 "movq %2, %%mm0 \n\t" 495 "movq %2, %%mm0 \n\t"
500 "paddusb %%mm0, %%mm0 \n\t" 496 "paddusb %%mm0, %%mm0 \n\t"
501 "psubusb %%mm0, %%mm4 \n\t" 497 "psubusb %%mm0, %%mm4 \n\t"
502 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0 498 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
503 "psubusb "MANGLE(b01)", %%mm3 \n\t" 499 "psubusb "MANGLE(b01)", %%mm3 \n\t"
504 "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0 500 "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0
505 501
506 PAVGB(%%mm7, %%mm3) // d/2 502 PAVGB(%%mm7, %%mm3) // d/2
507 "movq %%mm3, %%mm1 \n\t" // d/2 503 "movq %%mm3, %%mm1 \n\t" // d/2
508 PAVGB(%%mm7, %%mm3) // d/4 504 PAVGB(%%mm7, %%mm3) // d/4
509 PAVGB(%%mm1, %%mm3) // 3*d/8 505 PAVGB(%%mm1, %%mm3) // 3*d/8
510 506
511 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 507 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
512 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 508 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
513 "psubusb %%mm3, %%mm0 \n\t" 509 "psubusb %%mm3, %%mm0 \n\t"
514 "pxor %%mm2, %%mm0 \n\t" 510 "pxor %%mm2, %%mm0 \n\t"
515 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 511 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
516 512
517 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 513 "movq (%%"REG_c"), %%mm0 \n\t" // line 5
518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 514 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
519 "paddusb %%mm3, %%mm0 \n\t" 515 "paddusb %%mm3, %%mm0 \n\t"
520 "pxor %%mm2, %%mm0 \n\t" 516 "pxor %%mm2, %%mm0 \n\t"
521 "movq %%mm0, (%%"REG_c") \n\t" // line 5 517 "movq %%mm0, (%%"REG_c") \n\t" // line 5
522 518
523 PAVGB(%%mm7, %%mm1) // d/4 519 PAVGB(%%mm7, %%mm1) // d/4
524 520
525 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 521 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
526 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3 522 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
527 "psubusb %%mm1, %%mm0 \n\t" 523 "psubusb %%mm1, %%mm0 \n\t"
528 "pxor %%mm2, %%mm0 \n\t" 524 "pxor %%mm2, %%mm0 \n\t"
529 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 525 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3
530 526
531 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 527 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6
532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6 528 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
533 "paddusb %%mm1, %%mm0 \n\t" 529 "paddusb %%mm1, %%mm0 \n\t"
534 "pxor %%mm2, %%mm0 \n\t" 530 "pxor %%mm2, %%mm0 \n\t"
535 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 531 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6
536 532
537 PAVGB(%%mm7, %%mm1) // d/8 533 PAVGB(%%mm7, %%mm1) // d/8
538 534
539 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 535 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2
540 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 536 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
541 "psubusb %%mm1, %%mm0 \n\t" 537 "psubusb %%mm1, %%mm0 \n\t"
542 "pxor %%mm2, %%mm0 \n\t" 538 "pxor %%mm2, %%mm0 \n\t"
543 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 539 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2
544 540
545 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 541 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7
546 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 542 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
547 "paddusb %%mm1, %%mm0 \n\t" 543 "paddusb %%mm1, %%mm0 \n\t"
548 "pxor %%mm2, %%mm0 \n\t" 544 "pxor %%mm2, %%mm0 \n\t"
549 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 545 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7
550 546
551 : 547 :
552 : "r" (src), "r" ((long)stride), "m" (co->pQPb) 548 : "r" (src), "r" ((long)stride), "m" (co->pQPb)
553 : "%"REG_a, "%"REG_c 549 : "%"REG_a, "%"REG_c
554 ); 550 );
555 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 551 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
556 552
557 const int l1= stride; 553 const int l1= stride;
558 const int l2= stride + l1; 554 const int l2= stride + l1;
559 const int l3= stride + l2; 555 const int l3= stride + l2;
560 const int l4= stride + l3; 556 const int l4= stride + l3;
561 const int l5= stride + l4; 557 const int l5= stride + l4;
562 const int l6= stride + l5; 558 const int l6= stride + l5;
563 const int l7= stride + l6; 559 const int l7= stride + l6;
564 // const int l8= stride + l7; 560 // const int l8= stride + l7;
565 // const int l9= stride + l8; 561 // const int l9= stride + l8;
566 int x; 562 int x;
567 563
568 src+= stride*3; 564 src+= stride*3;
569 for(x=0; x<BLOCK_SIZE; x++) 565 for(x=0; x<BLOCK_SIZE; x++){
570 { 566 int a= src[l3] - src[l4];
571 int a= src[l3] - src[l4]; 567 int b= src[l4] - src[l5];
572 int b= src[l4] - src[l5]; 568 int c= src[l5] - src[l6];
573 int c= src[l5] - src[l6]; 569
574 570 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
575 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); 571 d= FFMAX(d, 0);
576 d= FFMAX(d, 0); 572
577 573 if(d < co->QP*2){
578 if(d < co->QP*2) 574 int v = d * FFSIGN(-b);
579 { 575
580 int v = d * FFSIGN(-b); 576 src[l2] +=v>>3;
581 577 src[l3] +=v>>2;
582 src[l2] +=v>>3; 578 src[l4] +=(3*v)>>3;
583 src[l3] +=v>>2; 579 src[l5] -=(3*v)>>3;
584 src[l4] +=(3*v)>>3; 580 src[l6] -=v>>2;
585 src[l5] -=(3*v)>>3; 581 src[l7] -=v>>3;
586 src[l6] -=v>>2;
587 src[l7] -=v>>3;
588
589 }
590 src++;
591 } 582 }
583 src++;
584 }
592 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 585 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
593 } 586 }
594 587
595 #ifndef HAVE_ALTIVEC 588 #ifndef HAVE_ALTIVEC
596 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
597 { 590 {
598 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 591 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
599 /* 592 /*
600 uint8_t tmp[16]; 593 uint8_t tmp[16];
601 const int l1= stride; 594 const int l1= stride;
602 const int l2= stride + l1; 595 const int l2= stride + l1;
603 const int l3= stride + l2; 596 const int l3= stride + l2;
604 const int l4= (int)tmp - (int)src - stride*3; 597 const int l4= (int)tmp - (int)src - stride*3;
605 const int l5= (int)tmp - (int)src - stride*3 + 8; 598 const int l5= (int)tmp - (int)src - stride*3 + 8;
606 const int l6= stride*3 + l3; 599 const int l6= stride*3 + l3;
607 const int l7= stride + l6; 600 const int l7= stride + l6;
608 const int l8= stride + l7; 601 const int l8= stride + l7;
609 602
610 memcpy(tmp, src+stride*7, 8); 603 memcpy(tmp, src+stride*7, 8);
611 memcpy(tmp+8, src+stride*8, 8); 604 memcpy(tmp+8, src+stride*8, 8);
612 */ 605 */
613 src+= stride*4; 606 src+= stride*4;
614 asm volatile( 607 asm volatile(
615 608
616 #if 0 //slightly more accurate and slightly slower 609 #if 0 //slightly more accurate and slightly slower
617 "pxor %%mm7, %%mm7 \n\t" // 0 610 "pxor %%mm7, %%mm7 \n\t" // 0
618 "lea (%0, %1), %%"REG_a" \n\t" 611 "lea (%0, %1), %%"REG_a" \n\t"
619 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 612 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
620 // 0 1 2 3 4 5 6 7 613 // 0 1 2 3 4 5 6 7
621 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 614 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
622 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 615 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
623 616
624 617
625 "movq (%0, %1, 2), %%mm0 \n\t" // l2 618 "movq (%0, %1, 2), %%mm0 \n\t" // l2
626 "movq (%0), %%mm1 \n\t" // l0 619 "movq (%0), %%mm1 \n\t" // l0
627 "movq %%mm0, %%mm2 \n\t" // l2 620 "movq %%mm0, %%mm2 \n\t" // l2
628 PAVGB(%%mm7, %%mm0) // ~l2/2 621 PAVGB(%%mm7, %%mm0) // ~l2/2
629 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 622 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
630 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 623 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
631 624
632 "movq (%%"REG_a"), %%mm1 \n\t" // l1 625 "movq (%%"REG_a"), %%mm1 \n\t" // l1
633 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 626 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3
634 "movq %%mm1, %%mm4 \n\t" // l1 627 "movq %%mm1, %%mm4 \n\t" // l1
635 PAVGB(%%mm7, %%mm1) // ~l1/2 628 PAVGB(%%mm7, %%mm1) // ~l1/2
636 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 629 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
637 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 630 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
638 631
639 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 632 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
640 "psubusb %%mm1, %%mm0 \n\t" 633 "psubusb %%mm1, %%mm0 \n\t"
641 "psubusb %%mm4, %%mm1 \n\t" 634 "psubusb %%mm4, %%mm1 \n\t"
642 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 635 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
643 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 636 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
644 637
645 "movq (%0, %1, 4), %%mm0 \n\t" // l4 638 "movq (%0, %1, 4), %%mm0 \n\t" // l4
646 "movq %%mm0, %%mm4 \n\t" // l4 639 "movq %%mm0, %%mm4 \n\t" // l4
647 PAVGB(%%mm7, %%mm0) // ~l4/2 640 PAVGB(%%mm7, %%mm0) // ~l4/2
648 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 641 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
649 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 642 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
650 643
651 "movq (%%"REG_c"), %%mm2 \n\t" // l5 644 "movq (%%"REG_c"), %%mm2 \n\t" // l5
652 "movq %%mm3, %%mm5 \n\t" // l3 645 "movq %%mm3, %%mm5 \n\t" // l3
653 PAVGB(%%mm7, %%mm3) // ~l3/2 646 PAVGB(%%mm7, %%mm3) // ~l3/2
654 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 647 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
655 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 648 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
656 649
657 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 650 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
658 "psubusb %%mm3, %%mm0 \n\t" 651 "psubusb %%mm3, %%mm0 \n\t"
659 "psubusb %%mm6, %%mm3 \n\t" 652 "psubusb %%mm6, %%mm3 \n\t"
660 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 653 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
661 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 654 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
662 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 655 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
663 656
664 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 657 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6
665 "movq %%mm6, %%mm5 \n\t" // l6 658 "movq %%mm6, %%mm5 \n\t" // l6
666 PAVGB(%%mm7, %%mm6) // ~l6/2 659 PAVGB(%%mm7, %%mm6) // ~l6/2
667 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 660 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
668 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 661 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
669 662
670 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 663 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7
671 "movq %%mm2, %%mm4 \n\t" // l5 664 "movq %%mm2, %%mm4 \n\t" // l5
672 PAVGB(%%mm7, %%mm2) // ~l5/2 665 PAVGB(%%mm7, %%mm2) // ~l5/2
673 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 666 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
674 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 667 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
675 668
676 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 669 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
677 "psubusb %%mm2, %%mm6 \n\t" 670 "psubusb %%mm2, %%mm6 \n\t"
678 "psubusb %%mm4, %%mm2 \n\t" 671 "psubusb %%mm4, %%mm2 \n\t"
679 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 672 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
680 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 673 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
681 674
682 675
683 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 676 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
684 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 677 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
685 "paddusb "MANGLE(b01)", %%mm4 \n\t" 678 "paddusb "MANGLE(b01)", %%mm4 \n\t"
686 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 679 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
687 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 680 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
688 "pand %%mm4, %%mm3 \n\t" 681 "pand %%mm4, %%mm3 \n\t"
689 682
690 "movq %%mm3, %%mm1 \n\t" 683 "movq %%mm3, %%mm1 \n\t"
691 // "psubusb "MANGLE(b01)", %%mm3 \n\t" 684 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
692 PAVGB(%%mm7, %%mm3) 685 PAVGB(%%mm7, %%mm3)
693 PAVGB(%%mm7, %%mm3) 686 PAVGB(%%mm7, %%mm3)
694 "paddusb %%mm1, %%mm3 \n\t" 687 "paddusb %%mm1, %%mm3 \n\t"
695 // "paddusb "MANGLE(b01)", %%mm3 \n\t" 688 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
696 689
697 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 690 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3
698 "movq (%0, %1, 4), %%mm5 \n\t" //l4 691 "movq (%0, %1, 4), %%mm5 \n\t" //l4
699 "movq (%0, %1, 4), %%mm4 \n\t" //l4 692 "movq (%0, %1, 4), %%mm4 \n\t" //l4
700 "psubusb %%mm6, %%mm5 \n\t" 693 "psubusb %%mm6, %%mm5 \n\t"
701 "psubusb %%mm4, %%mm6 \n\t" 694 "psubusb %%mm4, %%mm6 \n\t"
702 "por %%mm6, %%mm5 \n\t" // |l3-l4| 695 "por %%mm6, %%mm5 \n\t" // |l3-l4|
703 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) 696 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
704 "pxor %%mm6, %%mm0 \n\t" 697 "pxor %%mm6, %%mm0 \n\t"
705 "pand %%mm0, %%mm3 \n\t" 698 "pand %%mm0, %%mm3 \n\t"
706 PMINUB(%%mm5, %%mm3, %%mm0) 699 PMINUB(%%mm5, %%mm3, %%mm0)
707 700
708 "psubusb "MANGLE(b01)", %%mm3 \n\t" 701 "psubusb "MANGLE(b01)", %%mm3 \n\t"
709 PAVGB(%%mm7, %%mm3) 702 PAVGB(%%mm7, %%mm3)
710 703
711 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 704 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
712 "movq (%0, %1, 4), %%mm2 \n\t" 705 "movq (%0, %1, 4), %%mm2 \n\t"
713 "pxor %%mm6, %%mm0 \n\t" 706 "pxor %%mm6, %%mm0 \n\t"
714 "pxor %%mm6, %%mm2 \n\t" 707 "pxor %%mm6, %%mm2 \n\t"
715 "psubb %%mm3, %%mm0 \n\t" 708 "psubb %%mm3, %%mm0 \n\t"
716 "paddb %%mm3, %%mm2 \n\t" 709 "paddb %%mm3, %%mm2 \n\t"
717 "pxor %%mm6, %%mm0 \n\t" 710 "pxor %%mm6, %%mm0 \n\t"
718 "pxor %%mm6, %%mm2 \n\t" 711 "pxor %%mm6, %%mm2 \n\t"
719 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 712 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
720 "movq %%mm2, (%0, %1, 4) \n\t" 713 "movq %%mm2, (%0, %1, 4) \n\t"
721 #endif //0 714 #endif //0
722 715
723 "lea (%0, %1), %%"REG_a" \n\t" 716 "lea (%0, %1), %%"REG_a" \n\t"
724 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 717 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
725 // 0 1 2 3 4 5 6 7 718 // 0 1 2 3 4 5 6 7
726 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 719 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
727 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 720 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
728 721
729 722
730 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 723 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3
731 "movq (%0, %1, 4), %%mm0 \n\t" // l4 724 "movq (%0, %1, 4), %%mm0 \n\t" // l4
732 "pxor %%mm6, %%mm1 \n\t" // -l3-1 725 "pxor %%mm6, %%mm1 \n\t" // -l3-1
733 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 726 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
734 // mm1=-l3-1, mm0=128-q 727 // mm1=-l3-1, mm0=128-q
735 728
736 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 729 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5
737 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 730 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2
738 "pxor %%mm6, %%mm2 \n\t" // -l5-1 731 "pxor %%mm6, %%mm2 \n\t" // -l5-1
739 "movq %%mm2, %%mm5 \n\t" // -l5-1 732 "movq %%mm2, %%mm5 \n\t" // -l5-1
740 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 733 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
741 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 734 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
742 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 735 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
743 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 736 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
744 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 737 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
745 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 738 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
746 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 739 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
747 740
748 "movq (%%"REG_a"), %%mm2 \n\t" // l1 741 "movq (%%"REG_a"), %%mm2 \n\t" // l1
749 "pxor %%mm6, %%mm2 \n\t" // -l1-1 742 "pxor %%mm6, %%mm2 \n\t" // -l1-1
750 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 743 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
751 PAVGB((%0), %%mm1) // (l0-l3+256)/2 744 PAVGB((%0), %%mm1) // (l0-l3+256)/2
752 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 745 "movq "MANGLE(b80)", %%mm3 \n\t" // 128
753 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 746 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
754 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 747 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
755 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 748 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
756 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 749 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
757 750
758 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 751 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2
759 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 752 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7
760 "pxor %%mm6, %%mm1 \n\t" // -l7-1 753 "pxor %%mm6, %%mm1 \n\t" // -l7-1
761 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 754 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
762 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 755 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
763 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 756 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
764 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 757 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
765 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 758 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
766 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 759 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
767 760
768 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 761 "movq "MANGLE(b00)", %%mm1 \n\t" // 0
769 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 762 "movq "MANGLE(b00)", %%mm5 \n\t" // 0
770 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 763 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
771 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 764 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
772 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| 765 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
773 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| 766 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
774 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 767 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
775 768
776 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 769 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
777 770
778 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 771 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
779 "movq %2, %%mm2 \n\t" // QP 772 "movq %2, %%mm2 \n\t" // QP
780 PAVGB(%%mm6, %%mm2) // 128 + QP/2 773 PAVGB(%%mm6, %%mm2) // 128 + QP/2
781 "psubb %%mm6, %%mm2 \n\t" 774 "psubb %%mm6, %%mm2 \n\t"
782 775
783 "movq %%mm4, %%mm1 \n\t" 776 "movq %%mm4, %%mm1 \n\t"
784 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 777 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
785 "pxor %%mm1, %%mm4 \n\t" 778 "pxor %%mm1, %%mm4 \n\t"
786 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 779 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
787 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 780 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
788 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 781 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
789 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 782 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
790 783
791 "movq %%mm4, %%mm3 \n\t" // d 784 "movq %%mm4, %%mm3 \n\t" // d
792 "psubusb "MANGLE(b01)", %%mm4 \n\t" 785 "psubusb "MANGLE(b01)", %%mm4 \n\t"
793 PAVGB(%%mm7, %%mm4) // d/32 786 PAVGB(%%mm7, %%mm4) // d/32
794 PAVGB(%%mm7, %%mm4) // (d + 32)/64 787 PAVGB(%%mm7, %%mm4) // (d + 32)/64
795 "paddb %%mm3, %%mm4 \n\t" // 5d/64 788 "paddb %%mm3, %%mm4 \n\t" // 5d/64
796 "pand %%mm2, %%mm4 \n\t" 789 "pand %%mm2, %%mm4 \n\t"
797 790
798 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 791 "movq "MANGLE(b80)", %%mm5 \n\t" // 128
799 "psubb %%mm0, %%mm5 \n\t" // q 792 "psubb %%mm0, %%mm5 \n\t" // q
800 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 793 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
801 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 794 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
802 "pxor %%mm7, %%mm5 \n\t" 795 "pxor %%mm7, %%mm5 \n\t"
803 796
804 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) 797 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
805 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 798 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
806 799
807 "pand %%mm7, %%mm4 \n\t" 800 "pand %%mm7, %%mm4 \n\t"
808 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 801 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
809 "movq (%0, %1, 4), %%mm2 \n\t" 802 "movq (%0, %1, 4), %%mm2 \n\t"
810 "pxor %%mm1, %%mm0 \n\t" 803 "pxor %%mm1, %%mm0 \n\t"
811 "pxor %%mm1, %%mm2 \n\t" 804 "pxor %%mm1, %%mm2 \n\t"
812 "paddb %%mm4, %%mm0 \n\t" 805 "paddb %%mm4, %%mm0 \n\t"
813 "psubb %%mm4, %%mm2 \n\t" 806 "psubb %%mm4, %%mm2 \n\t"
814 "pxor %%mm1, %%mm0 \n\t" 807 "pxor %%mm1, %%mm0 \n\t"
815 "pxor %%mm1, %%mm2 \n\t" 808 "pxor %%mm1, %%mm2 \n\t"
816 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 809 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
817 "movq %%mm2, (%0, %1, 4) \n\t" 810 "movq %%mm2, (%0, %1, 4) \n\t"
818 811
819 : 812 :
820 : "r" (src), "r" ((long)stride), "m" (c->pQPb) 813 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
821 : "%"REG_a, "%"REG_c 814 : "%"REG_a, "%"REG_c
822 ); 815 );
823 816
824 /* 817 /*
825 { 818 {
826 int x; 819 int x;
827 src-= stride; 820 src-= stride;
828 for(x=0; x<BLOCK_SIZE; x++) 821 for(x=0; x<BLOCK_SIZE; x++){
829 { 822 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
830 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 823 if(FFABS(middleEnergy)< 8*QP){
831 if(FFABS(middleEnergy)< 8*QP) 824 const int q=(src[l4] - src[l5])/2;
832 { 825 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
833 const int q=(src[l4] - src[l5])/2; 826 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
834 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 827
835 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 828 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
836 829 d= FFMAX(d, 0);
837 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 830
838 d= FFMAX(d, 0); 831 d= (5*d + 32) >> 6;
839 832 d*= FFSIGN(-middleEnergy);
840 d= (5*d + 32) >> 6; 833
841 d*= FFSIGN(-middleEnergy); 834 if(q>0){
842 835 d= d<0 ? 0 : d;
843 if(q>0) 836 d= d>q ? q : d;
844 { 837 }else{
845 d= d<0 ? 0 : d; 838 d= d>0 ? 0 : d;
846 d= d>q ? q : d; 839 d= d<q ? q : d;
847 } 840 }
848 else 841
849 { 842 src[l4]-= d;
850 d= d>0 ? 0 : d; 843 src[l5]+= d;
851 d= d<q ? q : d;
852 }
853
854 src[l4]-= d;
855 src[l5]+= d;
856 }
857 src++;
858 } 844 }
859 src-=8; 845 src++;
860 for(x=0; x<8; x++) 846 }
861 { 847 src-=8;
862 int y; 848 for(x=0; x<8; x++){
863 for(y=4; y<6; y++) 849 int y;
864 { 850 for(y=4; y<6; y++){
865 int d= src[x+y*stride] - tmp[x+(y-4)*8]; 851 int d= src[x+y*stride] - tmp[x+(y-4)*8];
866 int ad= FFABS(d); 852 int ad= FFABS(d);
867 static int max=0; 853 static int max=0;
868 static int sum=0; 854 static int sum=0;
869 static int num=0; 855 static int num=0;
870 static int bias=0; 856 static int bias=0;
871 857
872 if(max<ad) max=ad; 858 if(max<ad) max=ad;
873 sum+= ad>3 ? 1 : 0; 859 sum+= ad>3 ? 1 : 0;
874 if(ad>3) 860 if(ad>3){
875 { 861 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
876 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 862 }
877 } 863 if(y==4) bias+=d;
878 if(y==4) bias+=d; 864 num++;
879 num++; 865 if(num%1000000 == 0){
880 if(num%1000000 == 0) 866 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
881 { 867 }
882 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
883 }
884 }
885 } 868 }
869 }
886 } 870 }
887 */ 871 */
888 #elif defined (HAVE_MMX) 872 #elif defined (HAVE_MMX)
889 src+= stride*4; 873 src+= stride*4;
890 asm volatile( 874 asm volatile(
891 "pxor %%mm7, %%mm7 \n\t" 875 "pxor %%mm7, %%mm7 \n\t"
892 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 876 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
893 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 877 "and "ALIGN_MASK", %%"REG_c" \n\t" // align
894 // 0 1 2 3 4 5 6 7 878 // 0 1 2 3 4 5 6 7
895 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 879 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
896 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 880 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
897 881
898 "movq (%0), %%mm0 \n\t" 882 "movq (%0), %%mm0 \n\t"
899 "movq %%mm0, %%mm1 \n\t" 883 "movq %%mm0, %%mm1 \n\t"
900 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 884 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
901 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 885 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
902 886
903 "movq (%0, %1), %%mm2 \n\t" 887 "movq (%0, %1), %%mm2 \n\t"
904 "lea (%0, %1, 2), %%"REG_a" \n\t" 888 "lea (%0, %1, 2), %%"REG_a" \n\t"
905 "movq %%mm2, %%mm3 \n\t" 889 "movq %%mm2, %%mm3 \n\t"
906 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 890 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
907 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 891 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
908 892
909 "movq (%%"REG_a"), %%mm4 \n\t" 893 "movq (%%"REG_a"), %%mm4 \n\t"
910 "movq %%mm4, %%mm5 \n\t" 894 "movq %%mm4, %%mm5 \n\t"
911 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 895 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
912 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 896 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
913 897
914 "paddw %%mm0, %%mm0 \n\t" // 2L0 898 "paddw %%mm0, %%mm0 \n\t" // 2L0
915 "paddw %%mm1, %%mm1 \n\t" // 2H0 899 "paddw %%mm1, %%mm1 \n\t" // 2H0
916 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 900 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
917 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 901 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
918 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 902 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
919 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 903 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
920 904
921 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 905 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
922 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 906 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
923 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 907 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
924 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 908 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
925 909
926 "movq (%%"REG_a", %1), %%mm2 \n\t" 910 "movq (%%"REG_a", %1), %%mm2 \n\t"
927 "movq %%mm2, %%mm3 \n\t" 911 "movq %%mm2, %%mm3 \n\t"
928 "punpcklbw %%mm7, %%mm2 \n\t" // L3 912 "punpcklbw %%mm7, %%mm2 \n\t" // L3
929 "punpckhbw %%mm7, %%mm3 \n\t" // H3 913 "punpckhbw %%mm7, %%mm3 \n\t" // H3
930 914
931 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 915 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
932 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 916 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
933 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 917 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
934 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 918 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
935 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 919 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
936 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 920 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
937 921
938 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 922 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
939 "movq %%mm0, %%mm1 \n\t" 923 "movq %%mm0, %%mm1 \n\t"
940 "punpcklbw %%mm7, %%mm0 \n\t" // L4 924 "punpcklbw %%mm7, %%mm0 \n\t" // L4
941 "punpckhbw %%mm7, %%mm1 \n\t" // H4 925 "punpckhbw %%mm7, %%mm1 \n\t" // H4
942 926
943 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 927 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
944 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 928 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
945 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 929 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
946 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 930 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
947 "paddw %%mm4, %%mm4 \n\t" // 2L2 931 "paddw %%mm4, %%mm4 \n\t" // 2L2
948 "paddw %%mm5, %%mm5 \n\t" // 2H2 932 "paddw %%mm5, %%mm5 \n\t" // 2H2
949 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 933 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
950 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 934 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
951 935
952 "lea (%%"REG_a", %1), %0 \n\t" 936 "lea (%%"REG_a", %1), %0 \n\t"
953 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 937 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
954 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 938 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
955 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 939 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
956 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 940 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
957 //50 opcodes so far 941 //50 opcodes so far
958 "movq (%0, %1, 2), %%mm2 \n\t" 942 "movq (%0, %1, 2), %%mm2 \n\t"
959 "movq %%mm2, %%mm3 \n\t" 943 "movq %%mm2, %%mm3 \n\t"
960 "punpcklbw %%mm7, %%mm2 \n\t" // L5 944 "punpcklbw %%mm7, %%mm2 \n\t" // L5
961 "punpckhbw %%mm7, %%mm3 \n\t" // H5 945 "punpckhbw %%mm7, %%mm3 \n\t" // H5
962 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 946 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
963 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 947 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
964 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
965 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
966 950
967 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 951 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
968 "punpcklbw %%mm7, %%mm6 \n\t" // L6 952 "punpcklbw %%mm7, %%mm6 \n\t" // L6
969 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 953 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
970 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 954 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
971 "punpckhbw %%mm7, %%mm6 \n\t" // H6 955 "punpckhbw %%mm7, %%mm6 \n\t" // H6
972 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 956 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
973 957
974 "paddw %%mm0, %%mm0 \n\t" // 2L4 958 "paddw %%mm0, %%mm0 \n\t" // 2L4
975 "paddw %%mm1, %%mm1 \n\t" // 2H4 959 "paddw %%mm1, %%mm1 \n\t" // 2H4
976 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 960 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
977 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 961 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
978 962
979 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 963 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
980 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 964 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
981 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 965 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
982 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 966 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
983 967
984 "movq (%0, %1, 4), %%mm2 \n\t" 968 "movq (%0, %1, 4), %%mm2 \n\t"
985 "movq %%mm2, %%mm3 \n\t" 969 "movq %%mm2, %%mm3 \n\t"
986 "punpcklbw %%mm7, %%mm2 \n\t" // L7 970 "punpcklbw %%mm7, %%mm2 \n\t" // L7
987 "punpckhbw %%mm7, %%mm3 \n\t" // H7 971 "punpckhbw %%mm7, %%mm3 \n\t" // H7
988 972
989 "paddw %%mm2, %%mm2 \n\t" // 2L7 973 "paddw %%mm2, %%mm2 \n\t" // 2L7
990 "paddw %%mm3, %%mm3 \n\t" // 2H7 974 "paddw %%mm3, %%mm3 \n\t" // 2H7
991 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 975 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
992 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 976 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
993 977
994 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 978 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
995 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 979 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
996 980
997 #ifdef HAVE_MMX2 981 #ifdef HAVE_MMX2
998 "movq %%mm7, %%mm6 \n\t" // 0 982 "movq %%mm7, %%mm6 \n\t" // 0
999 "psubw %%mm0, %%mm6 \n\t" 983 "psubw %%mm0, %%mm6 \n\t"
1000 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 984 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1001 "movq %%mm7, %%mm6 \n\t" // 0 985 "movq %%mm7, %%mm6 \n\t" // 0
1002 "psubw %%mm1, %%mm6 \n\t" 986 "psubw %%mm1, %%mm6 \n\t"
1003 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 987 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1004 "movq %%mm7, %%mm6 \n\t" // 0 988 "movq %%mm7, %%mm6 \n\t" // 0
1005 "psubw %%mm2, %%mm6 \n\t" 989 "psubw %%mm2, %%mm6 \n\t"
1006 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 990 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1007 "movq %%mm7, %%mm6 \n\t" // 0 991 "movq %%mm7, %%mm6 \n\t" // 0
1008 "psubw %%mm3, %%mm6 \n\t" 992 "psubw %%mm3, %%mm6 \n\t"
1009 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 993 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1010 #else 994 #else
1011 "movq %%mm7, %%mm6 \n\t" // 0 995 "movq %%mm7, %%mm6 \n\t" // 0
1012 "pcmpgtw %%mm0, %%mm6 \n\t" 996 "pcmpgtw %%mm0, %%mm6 \n\t"
1013 "pxor %%mm6, %%mm0 \n\t" 997 "pxor %%mm6, %%mm0 \n\t"
1014 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 998 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1015 "movq %%mm7, %%mm6 \n\t" // 0 999 "movq %%mm7, %%mm6 \n\t" // 0
1016 "pcmpgtw %%mm1, %%mm6 \n\t" 1000 "pcmpgtw %%mm1, %%mm6 \n\t"
1017 "pxor %%mm6, %%mm1 \n\t" 1001 "pxor %%mm6, %%mm1 \n\t"
1018 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 1002 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1019 "movq %%mm7, %%mm6 \n\t" // 0 1003 "movq %%mm7, %%mm6 \n\t" // 0
1020 "pcmpgtw %%mm2, %%mm6 \n\t" 1004 "pcmpgtw %%mm2, %%mm6 \n\t"
1021 "pxor %%mm6, %%mm2 \n\t" 1005 "pxor %%mm6, %%mm2 \n\t"
1022 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 1006 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1023 "movq %%mm7, %%mm6 \n\t" // 0 1007 "movq %%mm7, %%mm6 \n\t" // 0
1024 "pcmpgtw %%mm3, %%mm6 \n\t" 1008 "pcmpgtw %%mm3, %%mm6 \n\t"
1025 "pxor %%mm6, %%mm3 \n\t" 1009 "pxor %%mm6, %%mm3 \n\t"
1026 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 1010 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1027 #endif 1011 #endif
1028 1012
1029 #ifdef HAVE_MMX2 1013 #ifdef HAVE_MMX2
1030 "pminsw %%mm2, %%mm0 \n\t" 1014 "pminsw %%mm2, %%mm0 \n\t"
1031 "pminsw %%mm3, %%mm1 \n\t" 1015 "pminsw %%mm3, %%mm1 \n\t"
1032 #else 1016 #else
1033 "movq %%mm0, %%mm6 \n\t" 1017 "movq %%mm0, %%mm6 \n\t"
1034 "psubusw %%mm2, %%mm6 \n\t" 1018 "psubusw %%mm2, %%mm6 \n\t"
1035 "psubw %%mm6, %%mm0 \n\t" 1019 "psubw %%mm6, %%mm0 \n\t"
1036 "movq %%mm1, %%mm6 \n\t" 1020 "movq %%mm1, %%mm6 \n\t"
1037 "psubusw %%mm3, %%mm6 \n\t" 1021 "psubusw %%mm3, %%mm6 \n\t"
1038 "psubw %%mm6, %%mm1 \n\t" 1022 "psubw %%mm6, %%mm1 \n\t"
1039 #endif 1023 #endif
1040 1024
1041 "movd %2, %%mm2 \n\t" // QP 1025 "movd %2, %%mm2 \n\t" // QP
1042 "punpcklbw %%mm7, %%mm2 \n\t" 1026 "punpcklbw %%mm7, %%mm2 \n\t"
1043 1027
1044 "movq %%mm7, %%mm6 \n\t" // 0 1028 "movq %%mm7, %%mm6 \n\t" // 0
1045 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 1029 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1046 "pxor %%mm6, %%mm4 \n\t" 1030 "pxor %%mm6, %%mm4 \n\t"
1047 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 1031 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1048 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 1032 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1049 "pxor %%mm7, %%mm5 \n\t" 1033 "pxor %%mm7, %%mm5 \n\t"
1050 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 1034 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1051 // 100 opcodes 1035 // 100 opcodes
1052 "psllw $3, %%mm2 \n\t" // 8QP 1036 "psllw $3, %%mm2 \n\t" // 8QP
1053 "movq %%mm2, %%mm3 \n\t" // 8QP 1037 "movq %%mm2, %%mm3 \n\t" // 8QP
1054 "pcmpgtw %%mm4, %%mm2 \n\t" 1038 "pcmpgtw %%mm4, %%mm2 \n\t"
1055 "pcmpgtw %%mm5, %%mm3 \n\t" 1039 "pcmpgtw %%mm5, %%mm3 \n\t"
1056 "pand %%mm2, %%mm4 \n\t" 1040 "pand %%mm2, %%mm4 \n\t"
1057 "pand %%mm3, %%mm5 \n\t" 1041 "pand %%mm3, %%mm5 \n\t"
1058 1042
1059 1043
1060 "psubusw %%mm0, %%mm4 \n\t" // hd 1044 "psubusw %%mm0, %%mm4 \n\t" // hd
1061 "psubusw %%mm1, %%mm5 \n\t" // ld 1045 "psubusw %%mm1, %%mm5 \n\t" // ld
1062 1046
1063 1047
1064 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 1048 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
1065 "pmullw %%mm2, %%mm4 \n\t" 1049 "pmullw %%mm2, %%mm4 \n\t"
1066 "pmullw %%mm2, %%mm5 \n\t" 1050 "pmullw %%mm2, %%mm5 \n\t"
1067 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 1051 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
1068 "paddw %%mm2, %%mm4 \n\t" 1052 "paddw %%mm2, %%mm4 \n\t"
1069 "paddw %%mm2, %%mm5 \n\t" 1053 "paddw %%mm2, %%mm5 \n\t"
1070 "psrlw $6, %%mm4 \n\t" 1054 "psrlw $6, %%mm4 \n\t"
1071 "psrlw $6, %%mm5 \n\t" 1055 "psrlw $6, %%mm5 \n\t"
1072 1056
1073 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 1057 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
1074 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 1058 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
1075 1059
1076 "pxor %%mm2, %%mm2 \n\t" 1060 "pxor %%mm2, %%mm2 \n\t"
1077 "pxor %%mm3, %%mm3 \n\t" 1061 "pxor %%mm3, %%mm3 \n\t"
1078 1062
1079 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 1063 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1080 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 1064 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1081 "pxor %%mm2, %%mm0 \n\t" 1065 "pxor %%mm2, %%mm0 \n\t"
1082 "pxor %%mm3, %%mm1 \n\t" 1066 "pxor %%mm3, %%mm1 \n\t"
1083 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 1067 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1084 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 1068 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1085 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 1069 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1086 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 1070 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1087 1071
1088 "pxor %%mm6, %%mm2 \n\t" 1072 "pxor %%mm6, %%mm2 \n\t"
1089 "pxor %%mm7, %%mm3 \n\t" 1073 "pxor %%mm7, %%mm3 \n\t"
1090 "pand %%mm2, %%mm4 \n\t" 1074 "pand %%mm2, %%mm4 \n\t"
1091 "pand %%mm3, %%mm5 \n\t" 1075 "pand %%mm3, %%mm5 \n\t"
1092 1076
1093 #ifdef HAVE_MMX2 1077 #ifdef HAVE_MMX2
1094 "pminsw %%mm0, %%mm4 \n\t" 1078 "pminsw %%mm0, %%mm4 \n\t"
1095 "pminsw %%mm1, %%mm5 \n\t" 1079 "pminsw %%mm1, %%mm5 \n\t"
1096 #else 1080 #else
1097 "movq %%mm4, %%mm2 \n\t" 1081 "movq %%mm4, %%mm2 \n\t"
1098 "psubusw %%mm0, %%mm2 \n\t" 1082 "psubusw %%mm0, %%mm2 \n\t"
1099 "psubw %%mm2, %%mm4 \n\t" 1083 "psubw %%mm2, %%mm4 \n\t"
1100 "movq %%mm5, %%mm2 \n\t" 1084 "movq %%mm5, %%mm2 \n\t"
1101 "psubusw %%mm1, %%mm2 \n\t" 1085 "psubusw %%mm1, %%mm2 \n\t"
1102 "psubw %%mm2, %%mm5 \n\t" 1086 "psubw %%mm2, %%mm5 \n\t"
1103 #endif 1087 #endif
1104 "pxor %%mm6, %%mm4 \n\t" 1088 "pxor %%mm6, %%mm4 \n\t"
1105 "pxor %%mm7, %%mm5 \n\t" 1089 "pxor %%mm7, %%mm5 \n\t"
1106 "psubw %%mm6, %%mm4 \n\t" 1090 "psubw %%mm6, %%mm4 \n\t"
1107 "psubw %%mm7, %%mm5 \n\t" 1091 "psubw %%mm7, %%mm5 \n\t"
1108 "packsswb %%mm5, %%mm4 \n\t" 1092 "packsswb %%mm5, %%mm4 \n\t"
1109 "movq (%0), %%mm0 \n\t" 1093 "movq (%0), %%mm0 \n\t"
1110 "paddb %%mm4, %%mm0 \n\t" 1094 "paddb %%mm4, %%mm0 \n\t"
1111 "movq %%mm0, (%0) \n\t" 1095 "movq %%mm0, (%0) \n\t"
1112 "movq (%0, %1), %%mm0 \n\t" 1096 "movq (%0, %1), %%mm0 \n\t"
1113 "psubb %%mm4, %%mm0 \n\t" 1097 "psubb %%mm4, %%mm0 \n\t"
1114 "movq %%mm0, (%0, %1) \n\t" 1098 "movq %%mm0, (%0, %1) \n\t"
1115 1099
1116 : "+r" (src) 1100 : "+r" (src)
1117 : "r" ((long)stride), "m" (c->pQPb) 1101 : "r" ((long)stride), "m" (c->pQPb)
1118 : "%"REG_a, "%"REG_c 1102 : "%"REG_a, "%"REG_c
1119 ); 1103 );
1120 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1104 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1121 const int l1= stride; 1105 const int l1= stride;
1122 const int l2= stride + l1; 1106 const int l2= stride + l1;
1123 const int l3= stride + l2; 1107 const int l3= stride + l2;
1124 const int l4= stride + l3; 1108 const int l4= stride + l3;
1125 const int l5= stride + l4; 1109 const int l5= stride + l4;
1126 const int l6= stride + l5; 1110 const int l6= stride + l5;
1127 const int l7= stride + l6; 1111 const int l7= stride + l6;
1128 const int l8= stride + l7; 1112 const int l8= stride + l7;
1129 // const int l9= stride + l8; 1113 // const int l9= stride + l8;
1130 int x; 1114 int x;
1131 src+= stride*3; 1115 src+= stride*3;
1132 for(x=0; x<BLOCK_SIZE; x++) 1116 for(x=0; x<BLOCK_SIZE; x++){
1133 { 1117 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1134 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 1118 if(FFABS(middleEnergy) < 8*c->QP){
1135 if(FFABS(middleEnergy) < 8*c->QP) 1119 const int q=(src[l4] - src[l5])/2;
1136 { 1120 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1137 const int q=(src[l4] - src[l5])/2; 1121 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1138 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 1122
1139 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 1123 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1140 1124 d= FFMAX(d, 0);
1141 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 1125
1142 d= FFMAX(d, 0); 1126 d= (5*d + 32) >> 6;
1143 1127 d*= FFSIGN(-middleEnergy);
1144 d= (5*d + 32) >> 6; 1128
1145 d*= FFSIGN(-middleEnergy); 1129 if(q>0){
1146 1130 d= d<0 ? 0 : d;
1147 if(q>0) 1131 d= d>q ? q : d;
1148 { 1132 }else{
1149 d= d<0 ? 0 : d; 1133 d= d>0 ? 0 : d;
1150 d= d>q ? q : d; 1134 d= d<q ? q : d;
1151 } 1135 }
1152 else 1136
1153 { 1137 src[l4]-= d;
1154 d= d>0 ? 0 : d; 1138 src[l5]+= d;
1155 d= d<q ? q : d;
1156 }
1157
1158 src[l4]-= d;
1159 src[l5]+= d;
1160 }
1161 src++;
1162 } 1139 }
1140 src++;
1141 }
1163 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1142 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1164 } 1143 }
1165 #endif //HAVE_ALTIVEC 1144 #endif //HAVE_ALTIVEC
1166 1145
1167 #ifndef HAVE_ALTIVEC 1146 #ifndef HAVE_ALTIVEC
1168 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) 1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1169 { 1148 {
1170 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1149 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1171 asm volatile( 1150 asm volatile(
1172 "pxor %%mm6, %%mm6 \n\t" 1151 "pxor %%mm6, %%mm6 \n\t"
1173 "pcmpeqb %%mm7, %%mm7 \n\t" 1152 "pcmpeqb %%mm7, %%mm7 \n\t"
1174 "movq %2, %%mm0 \n\t" 1153 "movq %2, %%mm0 \n\t"
1175 "punpcklbw %%mm6, %%mm0 \n\t" 1154 "punpcklbw %%mm6, %%mm0 \n\t"
1176 "psrlw $1, %%mm0 \n\t" 1155 "psrlw $1, %%mm0 \n\t"
1177 "psubw %%mm7, %%mm0 \n\t" 1156 "psubw %%mm7, %%mm0 \n\t"
1178 "packuswb %%mm0, %%mm0 \n\t" 1157 "packuswb %%mm0, %%mm0 \n\t"
1179 "movq %%mm0, %3 \n\t" 1158 "movq %%mm0, %3 \n\t"
1180 1159
1181 "lea (%0, %1), %%"REG_a" \n\t" 1160 "lea (%0, %1), %%"REG_a" \n\t"
1182 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1161 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1183 1162
1184 // 0 1 2 3 4 5 6 7 8 9 1163 // 0 1 2 3 4 5 6 7 8 9
1185 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1186 1165
1187 #undef FIND_MIN_MAX 1166 #undef FIND_MIN_MAX
1188 #ifdef HAVE_MMX2 1167 #ifdef HAVE_MMX2
1189 #define REAL_FIND_MIN_MAX(addr)\ 1168 #define REAL_FIND_MIN_MAX(addr)\
1190 "movq " #addr ", %%mm0 \n\t"\ 1169 "movq " #addr ", %%mm0 \n\t"\
1191 "pminub %%mm0, %%mm7 \n\t"\ 1170 "pminub %%mm0, %%mm7 \n\t"\
1192 "pmaxub %%mm0, %%mm6 \n\t" 1171 "pmaxub %%mm0, %%mm6 \n\t"
1193 #else 1172 #else
1194 #define REAL_FIND_MIN_MAX(addr)\ 1173 #define REAL_FIND_MIN_MAX(addr)\
1195 "movq " #addr ", %%mm0 \n\t"\ 1174 "movq " #addr ", %%mm0 \n\t"\
1196 "movq %%mm7, %%mm1 \n\t"\ 1175 "movq %%mm7, %%mm1 \n\t"\
1197 "psubusb %%mm0, %%mm6 \n\t"\ 1176 "psubusb %%mm0, %%mm6 \n\t"\
1198 "paddb %%mm0, %%mm6 \n\t"\ 1177 "paddb %%mm0, %%mm6 \n\t"\
1199 "psubusb %%mm0, %%mm1 \n\t"\ 1178 "psubusb %%mm0, %%mm1 \n\t"\
1200 "psubb %%mm1, %%mm7 \n\t" 1179 "psubb %%mm1, %%mm7 \n\t"
1201 #endif 1180 #endif
1202 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) 1181 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1203 1182
1204 FIND_MIN_MAX((%%REGa)) 1183 FIND_MIN_MAX((%%REGa))
1205 FIND_MIN_MAX((%%REGa, %1)) 1184 FIND_MIN_MAX((%%REGa, %1))
1208 FIND_MIN_MAX((%%REGd)) 1187 FIND_MIN_MAX((%%REGd))
1209 FIND_MIN_MAX((%%REGd, %1)) 1188 FIND_MIN_MAX((%%REGd, %1))
1210 FIND_MIN_MAX((%%REGd, %1, 2)) 1189 FIND_MIN_MAX((%%REGd, %1, 2))
1211 FIND_MIN_MAX((%0, %1, 8)) 1190 FIND_MIN_MAX((%0, %1, 8))
1212 1191
1213 "movq %%mm7, %%mm4 \n\t" 1192 "movq %%mm7, %%mm4 \n\t"
1214 "psrlq $8, %%mm7 \n\t" 1193 "psrlq $8, %%mm7 \n\t"
1215 #ifdef HAVE_MMX2 1194 #ifdef HAVE_MMX2
1216 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1195 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1217 "pshufw $0xF9, %%mm7, %%mm4 \n\t" 1196 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1218 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1219 "pshufw $0xFE, %%mm7, %%mm4 \n\t" 1198 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1220 "pminub %%mm4, %%mm7 \n\t" 1199 "pminub %%mm4, %%mm7 \n\t"
1221 #else 1200 #else
1222 "movq %%mm7, %%mm1 \n\t" 1201 "movq %%mm7, %%mm1 \n\t"
1223 "psubusb %%mm4, %%mm1 \n\t" 1202 "psubusb %%mm4, %%mm1 \n\t"
1224 "psubb %%mm1, %%mm7 \n\t" 1203 "psubb %%mm1, %%mm7 \n\t"
1225 "movq %%mm7, %%mm4 \n\t" 1204 "movq %%mm7, %%mm4 \n\t"
1226 "psrlq $16, %%mm7 \n\t" 1205 "psrlq $16, %%mm7 \n\t"
1227 "movq %%mm7, %%mm1 \n\t" 1206 "movq %%mm7, %%mm1 \n\t"
1228 "psubusb %%mm4, %%mm1 \n\t" 1207 "psubusb %%mm4, %%mm1 \n\t"
1229 "psubb %%mm1, %%mm7 \n\t" 1208 "psubb %%mm1, %%mm7 \n\t"
1230 "movq %%mm7, %%mm4 \n\t" 1209 "movq %%mm7, %%mm4 \n\t"
1231 "psrlq $32, %%mm7 \n\t" 1210 "psrlq $32, %%mm7 \n\t"
1232 "movq %%mm7, %%mm1 \n\t" 1211 "movq %%mm7, %%mm1 \n\t"
1233 "psubusb %%mm4, %%mm1 \n\t" 1212 "psubusb %%mm4, %%mm1 \n\t"
1234 "psubb %%mm1, %%mm7 \n\t" 1213 "psubb %%mm1, %%mm7 \n\t"
1235 #endif 1214 #endif
1236 1215
1237 1216
1238 "movq %%mm6, %%mm4 \n\t" 1217 "movq %%mm6, %%mm4 \n\t"
1239 "psrlq $8, %%mm6 \n\t" 1218 "psrlq $8, %%mm6 \n\t"
1240 #ifdef HAVE_MMX2 1219 #ifdef HAVE_MMX2
1241 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels 1220 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1242 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 1221 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1243 "pmaxub %%mm4, %%mm6 \n\t" 1222 "pmaxub %%mm4, %%mm6 \n\t"
1244 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 1223 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1245 "pmaxub %%mm4, %%mm6 \n\t" 1224 "pmaxub %%mm4, %%mm6 \n\t"
1246 #else 1225 #else
1247 "psubusb %%mm4, %%mm6 \n\t" 1226 "psubusb %%mm4, %%mm6 \n\t"
1248 "paddb %%mm4, %%mm6 \n\t" 1227 "paddb %%mm4, %%mm6 \n\t"
1249 "movq %%mm6, %%mm4 \n\t" 1228 "movq %%mm6, %%mm4 \n\t"
1250 "psrlq $16, %%mm6 \n\t" 1229 "psrlq $16, %%mm6 \n\t"
1251 "psubusb %%mm4, %%mm6 \n\t" 1230 "psubusb %%mm4, %%mm6 \n\t"
1252 "paddb %%mm4, %%mm6 \n\t" 1231 "paddb %%mm4, %%mm6 \n\t"
1253 "movq %%mm6, %%mm4 \n\t" 1232 "movq %%mm6, %%mm4 \n\t"
1254 "psrlq $32, %%mm6 \n\t" 1233 "psrlq $32, %%mm6 \n\t"
1255 "psubusb %%mm4, %%mm6 \n\t" 1234 "psubusb %%mm4, %%mm6 \n\t"
1256 "paddb %%mm4, %%mm6 \n\t" 1235 "paddb %%mm4, %%mm6 \n\t"
1257 #endif 1236 #endif
1258 "movq %%mm6, %%mm0 \n\t" // max 1237 "movq %%mm6, %%mm0 \n\t" // max
1259 "psubb %%mm7, %%mm6 \n\t" // max - min 1238 "psubb %%mm7, %%mm6 \n\t" // max - min
1260 "movd %%mm6, %%ecx \n\t" 1239 "movd %%mm6, %%ecx \n\t"
1261 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" 1240 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
1262 " jb 1f \n\t" 1241 " jb 1f \n\t"
1263 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" 1242 "lea -24(%%"REG_SP"), %%"REG_c" \n\t"
1264 "and "ALIGN_MASK", %%"REG_c" \n\t" 1243 "and "ALIGN_MASK", %%"REG_c" \n\t"
1265 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 1244 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
1266 "punpcklbw %%mm7, %%mm7 \n\t" 1245 "punpcklbw %%mm7, %%mm7 \n\t"
1267 "punpcklbw %%mm7, %%mm7 \n\t" 1246 "punpcklbw %%mm7, %%mm7 \n\t"
1268 "punpcklbw %%mm7, %%mm7 \n\t" 1247 "punpcklbw %%mm7, %%mm7 \n\t"
1269 "movq %%mm7, (%%"REG_c") \n\t" 1248 "movq %%mm7, (%%"REG_c") \n\t"
1270 1249
1271 "movq (%0), %%mm0 \n\t" // L10 1250 "movq (%0), %%mm0 \n\t" // L10
1272 "movq %%mm0, %%mm1 \n\t" // L10 1251 "movq %%mm0, %%mm1 \n\t" // L10
1273 "movq %%mm0, %%mm2 \n\t" // L10 1252 "movq %%mm0, %%mm2 \n\t" // L10
1274 "psllq $8, %%mm1 \n\t" 1253 "psllq $8, %%mm1 \n\t"
1275 "psrlq $8, %%mm2 \n\t" 1254 "psrlq $8, %%mm2 \n\t"
1276 "movd -4(%0), %%mm3 \n\t" 1255 "movd -4(%0), %%mm3 \n\t"
1277 "movd 8(%0), %%mm4 \n\t" 1256 "movd 8(%0), %%mm4 \n\t"
1278 "psrlq $24, %%mm3 \n\t" 1257 "psrlq $24, %%mm3 \n\t"
1279 "psllq $56, %%mm4 \n\t" 1258 "psllq $56, %%mm4 \n\t"
1280 "por %%mm3, %%mm1 \n\t" // L00 1259 "por %%mm3, %%mm1 \n\t" // L00
1281 "por %%mm4, %%mm2 \n\t" // L20 1260 "por %%mm4, %%mm2 \n\t" // L20
1282 "movq %%mm1, %%mm3 \n\t" // L00 1261 "movq %%mm1, %%mm3 \n\t" // L00
1283 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 1262 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1284 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 1263 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1285 "psubusb %%mm7, %%mm0 \n\t" 1264 "psubusb %%mm7, %%mm0 \n\t"
1286 "psubusb %%mm7, %%mm2 \n\t" 1265 "psubusb %%mm7, %%mm2 \n\t"
1287 "psubusb %%mm7, %%mm3 \n\t" 1266 "psubusb %%mm7, %%mm3 \n\t"
1288 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 1267 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1289 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 1268 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1290 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 1269 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1291 "paddb %%mm2, %%mm0 \n\t" 1270 "paddb %%mm2, %%mm0 \n\t"
1292 "paddb %%mm3, %%mm0 \n\t" 1271 "paddb %%mm3, %%mm0 \n\t"
1293 1272
1294 "movq (%%"REG_a"), %%mm2 \n\t" // L11 1273 "movq (%%"REG_a"), %%mm2 \n\t" // L11
1295 "movq %%mm2, %%mm3 \n\t" // L11 1274 "movq %%mm2, %%mm3 \n\t" // L11
1296 "movq %%mm2, %%mm4 \n\t" // L11 1275 "movq %%mm2, %%mm4 \n\t" // L11
1297 "psllq $8, %%mm3 \n\t" 1276 "psllq $8, %%mm3 \n\t"
1298 "psrlq $8, %%mm4 \n\t" 1277 "psrlq $8, %%mm4 \n\t"
1299 "movd -4(%%"REG_a"), %%mm5 \n\t" 1278 "movd -4(%%"REG_a"), %%mm5 \n\t"
1300 "movd 8(%%"REG_a"), %%mm6 \n\t" 1279 "movd 8(%%"REG_a"), %%mm6 \n\t"
1301 "psrlq $24, %%mm5 \n\t" 1280 "psrlq $24, %%mm5 \n\t"
1302 "psllq $56, %%mm6 \n\t" 1281 "psllq $56, %%mm6 \n\t"
1303 "por %%mm5, %%mm3 \n\t" // L01 1282 "por %%mm5, %%mm3 \n\t" // L01
1304 "por %%mm6, %%mm4 \n\t" // L21 1283 "por %%mm6, %%mm4 \n\t" // L21
1305 "movq %%mm3, %%mm5 \n\t" // L01 1284 "movq %%mm3, %%mm5 \n\t" // L01
1306 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 1285 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1307 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 1286 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1308 "psubusb %%mm7, %%mm2 \n\t" 1287 "psubusb %%mm7, %%mm2 \n\t"
1309 "psubusb %%mm7, %%mm4 \n\t" 1288 "psubusb %%mm7, %%mm4 \n\t"
1310 "psubusb %%mm7, %%mm5 \n\t" 1289 "psubusb %%mm7, %%mm5 \n\t"
1311 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 1290 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1312 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 1291 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1313 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 1292 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1314 "paddb %%mm4, %%mm2 \n\t" 1293 "paddb %%mm4, %%mm2 \n\t"
1315 "paddb %%mm5, %%mm2 \n\t" 1294 "paddb %%mm5, %%mm2 \n\t"
1316 // 0, 2, 3, 1 1295 // 0, 2, 3, 1
1317 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1296 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1318 "movq " #src ", " #sx " \n\t" /* src[0] */\ 1297 "movq " #src ", " #sx " \n\t" /* src[0] */\
1319 "movq " #sx ", " #lx " \n\t" /* src[0] */\ 1298 "movq " #sx ", " #lx " \n\t" /* src[0] */\
1320 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ 1299 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1321 "psllq $8, " #lx " \n\t"\ 1300 "psllq $8, " #lx " \n\t"\
1322 "psrlq $8, " #t0 " \n\t"\ 1301 "psrlq $8, " #t0 " \n\t"\
1323 "movd -4" #src ", " #t1 " \n\t"\ 1302 "movd -4" #src ", " #t1 " \n\t"\
1324 "psrlq $24, " #t1 " \n\t"\ 1303 "psrlq $24, " #t1 " \n\t"\
1325 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ 1304 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1326 "movd 8" #src ", " #t1 " \n\t"\ 1305 "movd 8" #src ", " #t1 " \n\t"\
1327 "psllq $56, " #t1 " \n\t"\ 1306 "psllq $56, " #t1 " \n\t"\
1328 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ 1307 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1329 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ 1308 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1330 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ 1309 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1331 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ 1310 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1332 PAVGB(lx, pplx) \ 1311 PAVGB(lx, pplx) \
1333 "movq " #lx ", 8(%%"REG_c") \n\t"\ 1312 "movq " #lx ", 8(%%"REG_c") \n\t"\
1334 "movq (%%"REG_c"), " #lx " \n\t"\ 1313 "movq (%%"REG_c"), " #lx " \n\t"\
1335 "psubusb " #lx ", " #t1 " \n\t"\ 1314 "psubusb " #lx ", " #t1 " \n\t"\
1336 "psubusb " #lx ", " #t0 " \n\t"\ 1315 "psubusb " #lx ", " #t0 " \n\t"\
1337 "psubusb " #lx ", " #sx " \n\t"\ 1316 "psubusb " #lx ", " #sx " \n\t"\
1338 "movq "MANGLE(b00)", " #lx " \n\t"\ 1317 "movq "MANGLE(b00)", " #lx " \n\t"\
1339 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ 1318 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1340 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ 1319 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1341 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ 1320 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1342 "paddb " #t1 ", " #t0 " \n\t"\ 1321 "paddb " #t1 ", " #t0 " \n\t"\
1343 "paddb " #t0 ", " #sx " \n\t"\ 1322 "paddb " #t0 ", " #sx " \n\t"\
1344 \ 1323 \
1345 PAVGB(plx, pplx) /* filtered */\ 1324 PAVGB(plx, pplx) /* filtered */\
1346 "movq " #dst ", " #t0 " \n\t" /* dst */\ 1325 "movq " #dst ", " #t0 " \n\t" /* dst */\
1347 "movq " #t0 ", " #t1 " \n\t" /* dst */\ 1326 "movq " #t0 ", " #t1 " \n\t" /* dst */\
1348 "psubusb %3, " #t0 " \n\t"\ 1327 "psubusb %3, " #t0 " \n\t"\
1349 "paddusb %3, " #t1 " \n\t"\ 1328 "paddusb %3, " #t1 " \n\t"\
1350 PMAXUB(t0, pplx)\ 1329 PMAXUB(t0, pplx)\
1351 PMINUB(t1, pplx, t0)\ 1330 PMINUB(t1, pplx, t0)\
1352 "paddb " #sx ", " #ppsx " \n\t"\ 1331 "paddb " #sx ", " #ppsx " \n\t"\
1353 "paddb " #psx ", " #ppsx " \n\t"\ 1332 "paddb " #psx ", " #ppsx " \n\t"\
1354 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ 1333 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1355 "pand "MANGLE(b08)", " #ppsx " \n\t"\ 1334 "pand "MANGLE(b08)", " #ppsx " \n\t"\
1356 "pcmpeqb " #lx ", " #ppsx " \n\t"\ 1335 "pcmpeqb " #lx ", " #ppsx " \n\t"\
1357 "pand " #ppsx ", " #pplx " \n\t"\ 1336 "pand " #ppsx ", " #pplx " \n\t"\
1358 "pandn " #dst ", " #ppsx " \n\t"\ 1337 "pandn " #dst ", " #ppsx " \n\t"\
1359 "por " #pplx ", " #ppsx " \n\t"\ 1338 "por " #pplx ", " #ppsx " \n\t"\
1360 "movq " #ppsx ", " #dst " \n\t"\ 1339 "movq " #ppsx ", " #dst " \n\t"\
1361 "movq 8(%%"REG_c"), " #lx " \n\t" 1340 "movq 8(%%"REG_c"), " #lx " \n\t"
1362 1341
1363 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 1342 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1364 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) 1343 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1365 /* 1344 /*
1366 0000000 1345 0000000
1385 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1364 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1386 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 1365 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1387 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 1366 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1388 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 1367 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1389 1368
1390 "1: \n\t" 1369 "1: \n\t"
1391 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) 1370 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1392 : "%"REG_a, "%"REG_d, "%"REG_c 1371 : "%"REG_a, "%"REG_d, "%"REG_c
1393 ); 1372 );
1394 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1373 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1395 int y; 1374 int y;
1396 int min=255; 1375 int min=255;
1397 int max=0; 1376 int max=0;
1398 int avg; 1377 int avg;
1399 uint8_t *p; 1378 uint8_t *p;
1400 int s[10]; 1379 int s[10];
1401 const int QP2= c->QP/2 + 1; 1380 const int QP2= c->QP/2 + 1;
1402 1381
1403 for(y=1; y<9; y++) 1382 for(y=1; y<9; y++){
1404 { 1383 int x;
1405 int x; 1384 p= src + stride*y;
1406 p= src + stride*y; 1385 for(x=1; x<9; x++){
1407 for(x=1; x<9; x++) 1386 p++;
1408 { 1387 if(*p > max) max= *p;
1409 p++; 1388 if(*p < min) min= *p;
1410 if(*p > max) max= *p;
1411 if(*p < min) min= *p;
1412 }
1413 } 1389 }
1414 avg= (min + max + 1)>>1; 1390 }
1415 1391 avg= (min + max + 1)>>1;
1416 if(max - min <deringThreshold) return; 1392
1417 1393 if(max - min <deringThreshold) return;
1418 for(y=0; y<10; y++) 1394
1419 { 1395 for(y=0; y<10; y++){
1420 int t = 0; 1396 int t = 0;
1421 1397
1422 if(src[stride*y + 0] > avg) t+= 1; 1398 if(src[stride*y + 0] > avg) t+= 1;
1423 if(src[stride*y + 1] > avg) t+= 2; 1399 if(src[stride*y + 1] > avg) t+= 2;
1424 if(src[stride*y + 2] > avg) t+= 4; 1400 if(src[stride*y + 2] > avg) t+= 4;
1425 if(src[stride*y + 3] > avg) t+= 8; 1401 if(src[stride*y + 3] > avg) t+= 8;
1426 if(src[stride*y + 4] > avg) t+= 16; 1402 if(src[stride*y + 4] > avg) t+= 16;
1427 if(src[stride*y + 5] > avg) t+= 32; 1403 if(src[stride*y + 5] > avg) t+= 32;
1428 if(src[stride*y + 6] > avg) t+= 64; 1404 if(src[stride*y + 6] > avg) t+= 64;
1429 if(src[stride*y + 7] > avg) t+= 128; 1405 if(src[stride*y + 7] > avg) t+= 128;
1430 if(src[stride*y + 8] > avg) t+= 256; 1406 if(src[stride*y + 8] > avg) t+= 256;
1431 if(src[stride*y + 9] > avg) t+= 512; 1407 if(src[stride*y + 9] > avg) t+= 512;
1432 1408
1433 t |= (~t)<<16; 1409 t |= (~t)<<16;
1434 t &= (t<<1) & (t>>1); 1410 t &= (t<<1) & (t>>1);
1435 s[y] = t; 1411 s[y] = t;
1412 }
1413
1414 for(y=1; y<9; y++){
1415 int t = s[y-1] & s[y] & s[y+1];
1416 t|= t>>16;
1417 s[y-1]= t;
1418 }
1419
1420 for(y=1; y<9; y++){
1421 int x;
1422 int t = s[y-1];
1423
1424 p= src + stride*y;
1425 for(x=1; x<9; x++){
1426 p++;
1427 if(t & (1<<x)){
1428 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1429 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1430 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1431 f= (f + 8)>>4;
1432
1433 #ifdef DEBUG_DERING_THRESHOLD
1434 asm volatile("emms\n\t":);
1435 {
1436 static long long numPixels=0;
1437 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1438 // if((max-min)<20 || (max-min)*QP<200)
1439 // if((max-min)*QP < 500)
1440 // if(max-min<QP/2)
1441 if(max-min < 20){
1442 static int numSkiped=0;
1443 static int errorSum=0;
1444 static int worstQP=0;
1445 static int worstRange=0;
1446 static int worstDiff=0;
1447 int diff= (f - *p);
1448 int absDiff= FFABS(diff);
1449 int error= diff*diff;
1450
1451 if(x==1 || x==8 || y==1 || y==8) continue;
1452
1453 numSkiped++;
1454 if(absDiff > worstDiff){
1455 worstDiff= absDiff;
1456 worstQP= QP;
1457 worstRange= max-min;
1458 }
1459 errorSum+= error;
1460
1461 if(1024LL*1024LL*1024LL % numSkiped == 0){
1462 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1463 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1464 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1465 worstDiff, (float)numSkiped/numPixels);
1466 }
1467 }
1468 }
1469 #endif
1470 if (*p + QP2 < f) *p= *p + QP2;
1471 else if(*p - QP2 > f) *p= *p - QP2;
1472 else *p=f;
1473 }
1436 } 1474 }
1437 1475 }
1438 for(y=1; y<9; y++) 1476 #ifdef DEBUG_DERING_THRESHOLD
1439 { 1477 if(max-min < 20){
1440 int t = s[y-1] & s[y] & s[y+1]; 1478 for(y=1; y<9; y++){
1441 t|= t>>16; 1479 int x;
1442 s[y-1]= t; 1480 int t = 0;
1481 p= src + stride*y;
1482 for(x=1; x<9; x++){
1483 p++;
1484 *p = FFMIN(*p + 20, 255);
1485 }
1443 } 1486 }
1444 1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1445 for(y=1; y<9; y++) 1488 }
1446 {
1447 int x;
1448 int t = s[y-1];
1449
1450 p= src + stride*y;
1451 for(x=1; x<9; x++)
1452 {
1453 p++;
1454 if(t & (1<<x))
1455 {
1456 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1457 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1458 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1459 f= (f + 8)>>4;
1460
1461 #ifdef DEBUG_DERING_THRESHOLD
1462 asm volatile("emms\n\t":);
1463 {
1464 static long long numPixels=0;
1465 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1466 // if((max-min)<20 || (max-min)*QP<200)
1467 // if((max-min)*QP < 500)
1468 // if(max-min<QP/2)
1469 if(max-min < 20)
1470 {
1471 static int numSkiped=0;
1472 static int errorSum=0;
1473 static int worstQP=0;
1474 static int worstRange=0;
1475 static int worstDiff=0;
1476 int diff= (f - *p);
1477 int absDiff= FFABS(diff);
1478 int error= diff*diff;
1479
1480 if(x==1 || x==8 || y==1 || y==8) continue;
1481
1482 numSkiped++;
1483 if(absDiff > worstDiff)
1484 {
1485 worstDiff= absDiff;
1486 worstQP= QP;
1487 worstRange= max-min;
1488 }
1489 errorSum+= error;
1490
1491 if(1024LL*1024LL*1024LL % numSkiped == 0)
1492 {
1493 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1494 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1495 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1496 worstDiff, (float)numSkiped/numPixels);
1497 }
1498 }
1499 }
1500 #endif
1501 if (*p + QP2 < f) *p= *p + QP2;
1502 else if(*p - QP2 > f) *p= *p - QP2;
1503 else *p=f;
1504 }
1505 }
1506 }
1507 #ifdef DEBUG_DERING_THRESHOLD
1508 if(max-min < 20)
1509 {
1510 for(y=1; y<9; y++)
1511 {
1512 int x;
1513 int t = 0;
1514 p= src + stride*y;
1515 for(x=1; x<9; x++)
1516 {
1517 p++;
1518 *p = FFMIN(*p + 20, 255);
1519 }
1520 }
1521 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1522 }
1523 #endif 1489 #endif
1524 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1490 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1525 } 1491 }
1526 #endif //HAVE_ALTIVEC 1492 #endif //HAVE_ALTIVEC
1527 1493
1532 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1533 */ 1499 */
1534 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1535 { 1501 {
1536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1502 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1537 src+= 4*stride; 1503 src+= 4*stride;
1538 asm volatile( 1504 asm volatile(
1539 "lea (%0, %1), %%"REG_a" \n\t" 1505 "lea (%0, %1), %%"REG_a" \n\t"
1540 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1541 // 0 1 2 3 4 5 6 7 8 9 1507 // 0 1 2 3 4 5 6 7 8 9
1542 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 1508 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1543 1509
1544 "movq (%0), %%mm0 \n\t" 1510 "movq (%0), %%mm0 \n\t"
1545 "movq (%%"REG_a", %1), %%mm1 \n\t" 1511 "movq (%%"REG_a", %1), %%mm1 \n\t"
1546 PAVGB(%%mm1, %%mm0) 1512 PAVGB(%%mm1, %%mm0)
1547 "movq %%mm0, (%%"REG_a") \n\t" 1513 "movq %%mm0, (%%"REG_a") \n\t"
1548 "movq (%0, %1, 4), %%mm0 \n\t" 1514 "movq (%0, %1, 4), %%mm0 \n\t"
1549 PAVGB(%%mm0, %%mm1) 1515 PAVGB(%%mm0, %%mm1)
1550 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" 1516 "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
1551 "movq (%%"REG_c", %1), %%mm1 \n\t" 1517 "movq (%%"REG_c", %1), %%mm1 \n\t"
1552 PAVGB(%%mm1, %%mm0) 1518 PAVGB(%%mm1, %%mm0)
1553 "movq %%mm0, (%%"REG_c") \n\t" 1519 "movq %%mm0, (%%"REG_c") \n\t"
1554 "movq (%0, %1, 8), %%mm0 \n\t" 1520 "movq (%0, %1, 8), %%mm0 \n\t"
1555 PAVGB(%%mm0, %%mm1) 1521 PAVGB(%%mm0, %%mm1)
1556 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" 1522 "movq %%mm1, (%%"REG_c", %1, 2) \n\t"
1557 1523
1558 : : "r" (src), "r" ((long)stride) 1524 : : "r" (src), "r" ((long)stride)
1559 : "%"REG_a, "%"REG_c 1525 : "%"REG_a, "%"REG_c
1560 ); 1526 );
1561 #else 1527 #else
1562 int a, b, x; 1528 int a, b, x;
1563 src+= 4*stride; 1529 src+= 4*stride;
1564 1530
1565 for(x=0; x<2; x++){ 1531 for(x=0; x<2; x++){
1566 a= *(uint32_t*)&src[stride*0]; 1532 a= *(uint32_t*)&src[stride*0];
1567 b= *(uint32_t*)&src[stride*2]; 1533 b= *(uint32_t*)&src[stride*2];
1568 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1534 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1569 a= *(uint32_t*)&src[stride*4]; 1535 a= *(uint32_t*)&src[stride*4];
1570 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1536 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1571 b= *(uint32_t*)&src[stride*6]; 1537 b= *(uint32_t*)&src[stride*6];
1572 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1538 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1573 a= *(uint32_t*)&src[stride*8]; 1539 a= *(uint32_t*)&src[stride*8];
1574 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1540 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1575 src += 4; 1541 src += 4;
1576 } 1542 }
1577 #endif 1543 #endif
1578 } 1544 }
1579 1545
1580 /** 1546 /**
1581 * Deinterlaces the given block by cubic interpolating every second line. 1547 * Deinterlaces the given block by cubic interpolating every second line.
1585 * this filter will read lines 3-15 and write 7-13 1551 * this filter will read lines 3-15 and write 7-13
1586 */ 1552 */
1587 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) 1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1588 { 1554 {
1589 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1590 src+= stride*3; 1556 src+= stride*3;
1591 asm volatile( 1557 asm volatile(
1592 "lea (%0, %1), %%"REG_a" \n\t" 1558 "lea (%0, %1), %%"REG_a" \n\t"
1593 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1594 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" 1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1595 "add %1, %%"REG_c" \n\t" 1561 "add %1, %%"REG_c" \n\t"
1596 "pxor %%mm7, %%mm7 \n\t" 1562 "pxor %%mm7, %%mm7 \n\t"
1597 // 0 1 2 3 4 5 6 7 8 9 10 1563 // 0 1 2 3 4 5 6 7 8 9 10
1598 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1564 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1599 1565
1600 #define REAL_DEINT_CUBIC(a,b,c,d,e)\ 1566 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1601 "movq " #a ", %%mm0 \n\t"\ 1567 "movq " #a ", %%mm0 \n\t"\
1602 "movq " #b ", %%mm1 \n\t"\ 1568 "movq " #b ", %%mm1 \n\t"\
1603 "movq " #d ", %%mm2 \n\t"\ 1569 "movq " #d ", %%mm2 \n\t"\
1604 "movq " #e ", %%mm3 \n\t"\ 1570 "movq " #e ", %%mm3 \n\t"\
1605 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ 1571 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1606 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ 1572 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
1607 "movq %%mm0, %%mm2 \n\t"\ 1573 "movq %%mm0, %%mm2 \n\t"\
1608 "punpcklbw %%mm7, %%mm0 \n\t"\ 1574 "punpcklbw %%mm7, %%mm0 \n\t"\
1609 "punpckhbw %%mm7, %%mm2 \n\t"\ 1575 "punpckhbw %%mm7, %%mm2 \n\t"\
1610 "movq %%mm1, %%mm3 \n\t"\ 1576 "movq %%mm1, %%mm3 \n\t"\
1611 "punpcklbw %%mm7, %%mm1 \n\t"\ 1577 "punpcklbw %%mm7, %%mm1 \n\t"\
1612 "punpckhbw %%mm7, %%mm3 \n\t"\ 1578 "punpckhbw %%mm7, %%mm3 \n\t"\
1613 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ 1579 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1614 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ 1580 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1615 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ 1581 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1616 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ 1582 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1617 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ 1583 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1618 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ 1584 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1619 "packuswb %%mm3, %%mm1 \n\t"\ 1585 "packuswb %%mm3, %%mm1 \n\t"\
1620 "movq %%mm1, " #c " \n\t" 1586 "movq %%mm1, " #c " \n\t"
1621 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) 1587 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1622 1588
1623 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1)) 1589 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1624 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8)) 1590 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
1625 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc)) 1591 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1626 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) 1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1627 1593
1628 : : "r" (src), "r" ((long)stride) 1594 : : "r" (src), "r" ((long)stride)
1629 : "%"REG_a, "%"REG_d, "%"REG_c 1595 : "%"REG_a, "%"REG_d, "%"REG_c
1630 ); 1596 );
1631 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1597 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1632 int x; 1598 int x;
1633 src+= stride*3; 1599 src+= stride*3;
1634 for(x=0; x<8; x++) 1600 for(x=0; x<8; x++){
1635 { 1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1636 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); 1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1637 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); 1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1638 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); 1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1639 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); 1605 src++;
1640 src++; 1606 }
1641 }
1642 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1643 } 1608 }
1644 1609
1645 /** 1610 /**
1646 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. 1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1650 * this filter will read lines 4-13 and write 5-11 1615 * this filter will read lines 4-13 and write 5-11
1651 */ 1616 */
1652 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1653 { 1618 {
1654 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1619 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1655 src+= stride*4; 1620 src+= stride*4;
1656 asm volatile( 1621 asm volatile(
1657 "lea (%0, %1), %%"REG_a" \n\t" 1622 "lea (%0, %1), %%"REG_a" \n\t"
1658 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1659 "pxor %%mm7, %%mm7 \n\t" 1624 "pxor %%mm7, %%mm7 \n\t"
1660 "movq (%2), %%mm0 \n\t" 1625 "movq (%2), %%mm0 \n\t"
1661 // 0 1 2 3 4 5 6 7 8 9 10 1626 // 0 1 2 3 4 5 6 7 8 9 10
1662 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1627 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1663 1628
1664 #define REAL_DEINT_FF(a,b,c,d)\ 1629 #define REAL_DEINT_FF(a,b,c,d)\
1665 "movq " #a ", %%mm1 \n\t"\ 1630 "movq " #a ", %%mm1 \n\t"\
1666 "movq " #b ", %%mm2 \n\t"\ 1631 "movq " #b ", %%mm2 \n\t"\
1667 "movq " #c ", %%mm3 \n\t"\ 1632 "movq " #c ", %%mm3 \n\t"\
1668 "movq " #d ", %%mm4 \n\t"\ 1633 "movq " #d ", %%mm4 \n\t"\
1669 PAVGB(%%mm3, %%mm1) \ 1634 PAVGB(%%mm3, %%mm1) \
1670 PAVGB(%%mm4, %%mm0) \ 1635 PAVGB(%%mm4, %%mm0) \
1671 "movq %%mm0, %%mm3 \n\t"\ 1636 "movq %%mm0, %%mm3 \n\t"\
1672 "punpcklbw %%mm7, %%mm0 \n\t"\ 1637 "punpcklbw %%mm7, %%mm0 \n\t"\
1673 "punpckhbw %%mm7, %%mm3 \n\t"\ 1638 "punpckhbw %%mm7, %%mm3 \n\t"\
1674 "movq %%mm1, %%mm4 \n\t"\ 1639 "movq %%mm1, %%mm4 \n\t"\
1675 "punpcklbw %%mm7, %%mm1 \n\t"\ 1640 "punpcklbw %%mm7, %%mm1 \n\t"\
1676 "punpckhbw %%mm7, %%mm4 \n\t"\ 1641 "punpckhbw %%mm7, %%mm4 \n\t"\
1677 "psllw $2, %%mm1 \n\t"\ 1642 "psllw $2, %%mm1 \n\t"\
1678 "psllw $2, %%mm4 \n\t"\ 1643 "psllw $2, %%mm4 \n\t"\
1679 "psubw %%mm0, %%mm1 \n\t"\ 1644 "psubw %%mm0, %%mm1 \n\t"\
1680 "psubw %%mm3, %%mm4 \n\t"\ 1645 "psubw %%mm3, %%mm4 \n\t"\
1681 "movq %%mm2, %%mm5 \n\t"\ 1646 "movq %%mm2, %%mm5 \n\t"\
1682 "movq %%mm2, %%mm0 \n\t"\ 1647 "movq %%mm2, %%mm0 \n\t"\
1683 "punpcklbw %%mm7, %%mm2 \n\t"\ 1648 "punpcklbw %%mm7, %%mm2 \n\t"\
1684 "punpckhbw %%mm7, %%mm5 \n\t"\ 1649 "punpckhbw %%mm7, %%mm5 \n\t"\
1685 "paddw %%mm2, %%mm1 \n\t"\ 1650 "paddw %%mm2, %%mm1 \n\t"\
1686 "paddw %%mm5, %%mm4 \n\t"\ 1651 "paddw %%mm5, %%mm4 \n\t"\
1687 "psraw $2, %%mm1 \n\t"\ 1652 "psraw $2, %%mm1 \n\t"\
1688 "psraw $2, %%mm4 \n\t"\ 1653 "psraw $2, %%mm4 \n\t"\
1689 "packuswb %%mm4, %%mm1 \n\t"\ 1654 "packuswb %%mm4, %%mm1 \n\t"\
1690 "movq %%mm1, " #b " \n\t"\ 1655 "movq %%mm1, " #b " \n\t"\
1691 1656
1692 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) 1657 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1693 1658
1694 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) 1659 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
1695 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 1660 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
1696 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) 1661 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
1697 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1662 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1698 1663
1699 "movq %%mm0, (%2) \n\t" 1664 "movq %%mm0, (%2) \n\t"
1700 : : "r" (src), "r" ((long)stride), "r"(tmp) 1665 : : "r" (src), "r" ((long)stride), "r"(tmp)
1701 : "%"REG_a, "%"REG_d 1666 : "%"REG_a, "%"REG_d
1702 ); 1667 );
1703 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1668 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1704 int x; 1669 int x;
1705 src+= stride*4; 1670 src+= stride*4;
1706 for(x=0; x<8; x++) 1671 for(x=0; x<8; x++){
1707 { 1672 int t1= tmp[x];
1708 int t1= tmp[x]; 1673 int t2= src[stride*1];
1709 int t2= src[stride*1]; 1674
1710 1675 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1711 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); 1676 t1= src[stride*4];
1712 t1= src[stride*4]; 1677 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1713 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); 1678 t2= src[stride*6];
1714 t2= src[stride*6]; 1679 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1715 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); 1680 t1= src[stride*8];
1716 t1= src[stride*8]; 1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1717 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 1682 tmp[x]= t1;
1718 tmp[x]= t1; 1683
1719 1684 src++;
1720 src++; 1685 }
1721 }
1722 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1686 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1723 } 1687 }
1724 1688
1725 /** 1689 /**
1726 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. 1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1730 * this filter will read lines 4-13 and write 4-11 1694 * this filter will read lines 4-13 and write 4-11
1731 */ 1695 */
1732 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1733 { 1697 {
1734 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1698 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1735 src+= stride*4; 1699 src+= stride*4;
1736 asm volatile( 1700 asm volatile(
1737 "lea (%0, %1), %%"REG_a" \n\t" 1701 "lea (%0, %1), %%"REG_a" \n\t"
1738 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1739 "pxor %%mm7, %%mm7 \n\t" 1703 "pxor %%mm7, %%mm7 \n\t"
1740 "movq (%2), %%mm0 \n\t" 1704 "movq (%2), %%mm0 \n\t"
1741 "movq (%3), %%mm1 \n\t" 1705 "movq (%3), %%mm1 \n\t"
1742 // 0 1 2 3 4 5 6 7 8 9 10 1706 // 0 1 2 3 4 5 6 7 8 9 10
1743 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 1707 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1744 1708
1745 #define REAL_DEINT_L5(t1,t2,a,b,c)\ 1709 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1746 "movq " #a ", %%mm2 \n\t"\ 1710 "movq " #a ", %%mm2 \n\t"\
1747 "movq " #b ", %%mm3 \n\t"\ 1711 "movq " #b ", %%mm3 \n\t"\
1748 "movq " #c ", %%mm4 \n\t"\ 1712 "movq " #c ", %%mm4 \n\t"\
1749 PAVGB(t2, %%mm3) \ 1713 PAVGB(t2, %%mm3) \
1750 PAVGB(t1, %%mm4) \ 1714 PAVGB(t1, %%mm4) \
1751 "movq %%mm2, %%mm5 \n\t"\ 1715 "movq %%mm2, %%mm5 \n\t"\
1752 "movq %%mm2, " #t1 " \n\t"\ 1716 "movq %%mm2, " #t1 " \n\t"\
1753 "punpcklbw %%mm7, %%mm2 \n\t"\ 1717 "punpcklbw %%mm7, %%mm2 \n\t"\
1754 "punpckhbw %%mm7, %%mm5 \n\t"\ 1718 "punpckhbw %%mm7, %%mm5 \n\t"\
1755 "movq %%mm2, %%mm6 \n\t"\ 1719 "movq %%mm2, %%mm6 \n\t"\
1756 "paddw %%mm2, %%mm2 \n\t"\ 1720 "paddw %%mm2, %%mm2 \n\t"\
1757 "paddw %%mm6, %%mm2 \n\t"\ 1721 "paddw %%mm6, %%mm2 \n\t"\
1758 "movq %%mm5, %%mm6 \n\t"\ 1722 "movq %%mm5, %%mm6 \n\t"\
1759 "paddw %%mm5, %%mm5 \n\t"\ 1723 "paddw %%mm5, %%mm5 \n\t"\
1760 "paddw %%mm6, %%mm5 \n\t"\ 1724 "paddw %%mm6, %%mm5 \n\t"\
1761 "movq %%mm3, %%mm6 \n\t"\ 1725 "movq %%mm3, %%mm6 \n\t"\
1762 "punpcklbw %%mm7, %%mm3 \n\t"\ 1726 "punpcklbw %%mm7, %%mm3 \n\t"\
1763 "punpckhbw %%mm7, %%mm6 \n\t"\ 1727 "punpckhbw %%mm7, %%mm6 \n\t"\
1764 "paddw %%mm3, %%mm3 \n\t"\ 1728 "paddw %%mm3, %%mm3 \n\t"\
1765 "paddw %%mm6, %%mm6 \n\t"\ 1729 "paddw %%mm6, %%mm6 \n\t"\
1766 "paddw %%mm3, %%mm2 \n\t"\ 1730 "paddw %%mm3, %%mm2 \n\t"\
1767 "paddw %%mm6, %%mm5 \n\t"\ 1731 "paddw %%mm6, %%mm5 \n\t"\
1768 "movq %%mm4, %%mm6 \n\t"\ 1732 "movq %%mm4, %%mm6 \n\t"\
1769 "punpcklbw %%mm7, %%mm4 \n\t"\ 1733 "punpcklbw %%mm7, %%mm4 \n\t"\
1770 "punpckhbw %%mm7, %%mm6 \n\t"\ 1734 "punpckhbw %%mm7, %%mm6 \n\t"\
1771 "psubw %%mm4, %%mm2 \n\t"\ 1735 "psubw %%mm4, %%mm2 \n\t"\
1772 "psubw %%mm6, %%mm5 \n\t"\ 1736 "psubw %%mm6, %%mm5 \n\t"\
1773 "psraw $2, %%mm2 \n\t"\ 1737 "psraw $2, %%mm2 \n\t"\
1774 "psraw $2, %%mm5 \n\t"\ 1738 "psraw $2, %%mm5 \n\t"\
1775 "packuswb %%mm5, %%mm2 \n\t"\ 1739 "packuswb %%mm5, %%mm2 \n\t"\
1776 "movq %%mm2, " #a " \n\t"\ 1740 "movq %%mm2, " #a " \n\t"\
1777 1741
1778 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) 1742 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1779 1743
1780 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) 1744 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
1781 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) 1745 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
1784 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) 1748 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
1785 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) 1749 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
1786 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) 1750 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
1787 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 1751 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1788 1752
1789 "movq %%mm0, (%2) \n\t" 1753 "movq %%mm0, (%2) \n\t"
1790 "movq %%mm1, (%3) \n\t" 1754 "movq %%mm1, (%3) \n\t"
1791 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2) 1755 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
1792 : "%"REG_a, "%"REG_d 1756 : "%"REG_a, "%"REG_d
1793 ); 1757 );
1794 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1758 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1795 int x; 1759 int x;
1796 src+= stride*4; 1760 src+= stride*4;
1797 for(x=0; x<8; x++) 1761 for(x=0; x<8; x++){
1798 { 1762 int t1= tmp[x];
1799 int t1= tmp[x]; 1763 int t2= tmp2[x];
1800 int t2= tmp2[x]; 1764 int t3= src[0];
1801 int t3= src[0]; 1765
1802 1766 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1803 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); 1767 t1= src[stride*1];
1804 t1= src[stride*1]; 1768 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1805 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); 1769 t2= src[stride*2];
1806 t2= src[stride*2]; 1770 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1807 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); 1771 t3= src[stride*3];
1808 t3= src[stride*3]; 1772 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1809 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); 1773 t1= src[stride*4];
1810 t1= src[stride*4]; 1774 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1811 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); 1775 t2= src[stride*5];
1812 t2= src[stride*5]; 1776 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1813 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); 1777 t3= src[stride*6];
1814 t3= src[stride*6]; 1778 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1815 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); 1779 t1= src[stride*7];
1816 t1= src[stride*7]; 1780 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1817 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); 1781
1818 1782 tmp[x]= t3;
1819 tmp[x]= t3; 1783 tmp2[x]= t1;
1820 tmp2[x]= t1; 1784
1821 1785 src++;
1822 src++; 1786 }
1823 }
1824 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1787 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1825 } 1788 }
1826 1789
1827 /** 1790 /**
1828 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. 1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1832 * this filter will read lines 4-13 and write 4-11 1795 * this filter will read lines 4-13 and write 4-11
1833 */ 1796 */
1834 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1835 { 1798 {
1836 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1799 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1837 src+= 4*stride; 1800 src+= 4*stride;
1838 asm volatile( 1801 asm volatile(
1839 "lea (%0, %1), %%"REG_a" \n\t" 1802 "lea (%0, %1), %%"REG_a" \n\t"
1840 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1841 // 0 1 2 3 4 5 6 7 8 9 1804 // 0 1 2 3 4 5 6 7 8 9
1842 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1805 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1843 1806
1844 "movq (%2), %%mm0 \n\t" // L0 1807 "movq (%2), %%mm0 \n\t" // L0
1845 "movq (%%"REG_a"), %%mm1 \n\t" // L2 1808 "movq (%%"REG_a"), %%mm1 \n\t" // L2
1846 PAVGB(%%mm1, %%mm0) // L0+L2 1809 PAVGB(%%mm1, %%mm0) // L0+L2
1847 "movq (%0), %%mm2 \n\t" // L1 1810 "movq (%0), %%mm2 \n\t" // L1
1848 PAVGB(%%mm2, %%mm0) 1811 PAVGB(%%mm2, %%mm0)
1849 "movq %%mm0, (%0) \n\t" 1812 "movq %%mm0, (%0) \n\t"
1850 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 1813 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3
1851 PAVGB(%%mm0, %%mm2) // L1+L3 1814 PAVGB(%%mm0, %%mm2) // L1+L3
1852 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 1815 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1853 "movq %%mm2, (%%"REG_a") \n\t" 1816 "movq %%mm2, (%%"REG_a") \n\t"
1854 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 1817 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
1855 PAVGB(%%mm2, %%mm1) // L2+L4 1818 PAVGB(%%mm2, %%mm1) // L2+L4
1856 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 1819 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1857 "movq %%mm1, (%%"REG_a", %1) \n\t" 1820 "movq %%mm1, (%%"REG_a", %1) \n\t"
1858 "movq (%0, %1, 4), %%mm1 \n\t" // L5 1821 "movq (%0, %1, 4), %%mm1 \n\t" // L5
1859 PAVGB(%%mm1, %%mm0) // L3+L5 1822 PAVGB(%%mm1, %%mm0) // L3+L5
1860 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 1823 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1861 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 1824 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
1862 "movq (%%"REG_d"), %%mm0 \n\t" // L6 1825 "movq (%%"REG_d"), %%mm0 \n\t" // L6
1863 PAVGB(%%mm0, %%mm2) // L4+L6 1826 PAVGB(%%mm0, %%mm2) // L4+L6
1864 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 1827 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1865 "movq %%mm2, (%0, %1, 4) \n\t" 1828 "movq %%mm2, (%0, %1, 4) \n\t"
1866 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 1829 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7
1867 PAVGB(%%mm2, %%mm1) // L5+L7 1830 PAVGB(%%mm2, %%mm1) // L5+L7
1868 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 1831 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1869 "movq %%mm1, (%%"REG_d") \n\t" 1832 "movq %%mm1, (%%"REG_d") \n\t"
1870 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 1833 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
1871 PAVGB(%%mm1, %%mm0) // L6+L8 1834 PAVGB(%%mm1, %%mm0) // L6+L8
1872 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 1835 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1873 "movq %%mm0, (%%"REG_d", %1) \n\t" 1836 "movq %%mm0, (%%"REG_d", %1) \n\t"
1874 "movq (%0, %1, 8), %%mm0 \n\t" // L9 1837 "movq (%0, %1, 8), %%mm0 \n\t" // L9
1875 PAVGB(%%mm0, %%mm2) // L7+L9 1838 PAVGB(%%mm0, %%mm2) // L7+L9
1876 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 1839 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1877 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1840 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1878 "movq %%mm1, (%2) \n\t" 1841 "movq %%mm1, (%2) \n\t"
1879 1842
1880 : : "r" (src), "r" ((long)stride), "r" (tmp) 1843 : : "r" (src), "r" ((long)stride), "r" (tmp)
1881 : "%"REG_a, "%"REG_d 1844 : "%"REG_a, "%"REG_d
1882 ); 1845 );
1883 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1846 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1884 int a, b, c, x; 1847 int a, b, c, x;
1885 src+= 4*stride; 1848 src+= 4*stride;
1886 1849
1887 for(x=0; x<2; x++){ 1850 for(x=0; x<2; x++){
1888 a= *(uint32_t*)&tmp[stride*0]; 1851 a= *(uint32_t*)&tmp[stride*0];
1889 b= *(uint32_t*)&src[stride*0]; 1852 b= *(uint32_t*)&src[stride*0];
1890 c= *(uint32_t*)&src[stride*1]; 1853 c= *(uint32_t*)&src[stride*1];
1891 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1854 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1892 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1855 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1893 1856
1894 a= *(uint32_t*)&src[stride*2]; 1857 a= *(uint32_t*)&src[stride*2];
1895 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1858 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1896 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1859 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1897 1860
1898 b= *(uint32_t*)&src[stride*3]; 1861 b= *(uint32_t*)&src[stride*3];
1899 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1862 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1900 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1863 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1901 1864
1902 c= *(uint32_t*)&src[stride*4]; 1865 c= *(uint32_t*)&src[stride*4];
1903 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1866 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1904 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1867 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1905 1868
1906 a= *(uint32_t*)&src[stride*5]; 1869 a= *(uint32_t*)&src[stride*5];
1907 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1870 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1908 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1871 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1909 1872
1910 b= *(uint32_t*)&src[stride*6]; 1873 b= *(uint32_t*)&src[stride*6];
1911 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 1874 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1912 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 1875 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1913 1876
1914 c= *(uint32_t*)&src[stride*7]; 1877 c= *(uint32_t*)&src[stride*7];
1915 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 1878 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1916 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 1879 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1917 1880
1918 a= *(uint32_t*)&src[stride*8]; 1881 a= *(uint32_t*)&src[stride*8];
1919 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 1882 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1920 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 1883 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1921 1884
1922 *(uint32_t*)&tmp[stride*0]= c; 1885 *(uint32_t*)&tmp[stride*0]= c;
1923 src += 4; 1886 src += 4;
1924 tmp += 4; 1887 tmp += 4;
1925 } 1888 }
1926 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1889 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1927 } 1890 }
1928 1891
1929 /** 1892 /**
1930 * Deinterlaces the given block by applying a median filter to every second line. 1893 * Deinterlaces the given block by applying a median filter to every second line.
1933 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1896 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1934 */ 1897 */
1935 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 1898 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1936 { 1899 {
1937 #ifdef HAVE_MMX 1900 #ifdef HAVE_MMX
1938 src+= 4*stride; 1901 src+= 4*stride;
1939 #ifdef HAVE_MMX2 1902 #ifdef HAVE_MMX2
1940 asm volatile( 1903 asm volatile(
1941 "lea (%0, %1), %%"REG_a" \n\t" 1904 "lea (%0, %1), %%"REG_a" \n\t"
1942 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1905 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1943 // 0 1 2 3 4 5 6 7 8 9 1906 // 0 1 2 3 4 5 6 7 8 9
1944 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1907 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1945 1908
1946 "movq (%0), %%mm0 \n\t" // 1909 "movq (%0), %%mm0 \n\t" //
1947 "movq (%%"REG_a", %1), %%mm2 \n\t" // 1910 "movq (%%"REG_a", %1), %%mm2 \n\t" //
1948 "movq (%%"REG_a"), %%mm1 \n\t" // 1911 "movq (%%"REG_a"), %%mm1 \n\t" //
1949 "movq %%mm0, %%mm3 \n\t" 1912 "movq %%mm0, %%mm3 \n\t"
1950 "pmaxub %%mm1, %%mm0 \n\t" // 1913 "pmaxub %%mm1, %%mm0 \n\t" //
1951 "pminub %%mm3, %%mm1 \n\t" // 1914 "pminub %%mm3, %%mm1 \n\t" //
1952 "pmaxub %%mm2, %%mm1 \n\t" // 1915 "pmaxub %%mm2, %%mm1 \n\t" //
1953 "pminub %%mm1, %%mm0 \n\t" 1916 "pminub %%mm1, %%mm0 \n\t"
1954 "movq %%mm0, (%%"REG_a") \n\t" 1917 "movq %%mm0, (%%"REG_a") \n\t"
1955 1918
1956 "movq (%0, %1, 4), %%mm0 \n\t" // 1919 "movq (%0, %1, 4), %%mm0 \n\t" //
1957 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // 1920 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
1958 "movq %%mm2, %%mm3 \n\t" 1921 "movq %%mm2, %%mm3 \n\t"
1959 "pmaxub %%mm1, %%mm2 \n\t" // 1922 "pmaxub %%mm1, %%mm2 \n\t" //
1960 "pminub %%mm3, %%mm1 \n\t" // 1923 "pminub %%mm3, %%mm1 \n\t" //
1961 "pmaxub %%mm0, %%mm1 \n\t" // 1924 "pmaxub %%mm0, %%mm1 \n\t" //
1962 "pminub %%mm1, %%mm2 \n\t" 1925 "pminub %%mm1, %%mm2 \n\t"
1963 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 1926 "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
1964 1927
1965 "movq (%%"REG_d"), %%mm2 \n\t" // 1928 "movq (%%"REG_d"), %%mm2 \n\t" //
1966 "movq (%%"REG_d", %1), %%mm1 \n\t" // 1929 "movq (%%"REG_d", %1), %%mm1 \n\t" //
1967 "movq %%mm2, %%mm3 \n\t" 1930 "movq %%mm2, %%mm3 \n\t"
1968 "pmaxub %%mm0, %%mm2 \n\t" // 1931 "pmaxub %%mm0, %%mm2 \n\t" //
1969 "pminub %%mm3, %%mm0 \n\t" // 1932 "pminub %%mm3, %%mm0 \n\t" //
1970 "pmaxub %%mm1, %%mm0 \n\t" // 1933 "pmaxub %%mm1, %%mm0 \n\t" //
1971 "pminub %%mm0, %%mm2 \n\t" 1934 "pminub %%mm0, %%mm2 \n\t"
1972 "movq %%mm2, (%%"REG_d") \n\t" 1935 "movq %%mm2, (%%"REG_d") \n\t"
1973 1936
1974 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // 1937 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
1975 "movq (%0, %1, 8), %%mm0 \n\t" // 1938 "movq (%0, %1, 8), %%mm0 \n\t" //
1976 "movq %%mm2, %%mm3 \n\t" 1939 "movq %%mm2, %%mm3 \n\t"
1977 "pmaxub %%mm0, %%mm2 \n\t" // 1940 "pmaxub %%mm0, %%mm2 \n\t" //
1978 "pminub %%mm3, %%mm0 \n\t" // 1941 "pminub %%mm3, %%mm0 \n\t" //
1979 "pmaxub %%mm1, %%mm0 \n\t" // 1942 "pmaxub %%mm1, %%mm0 \n\t" //
1980 "pminub %%mm0, %%mm2 \n\t" 1943 "pminub %%mm0, %%mm2 \n\t"
1981 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 1944 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1982 1945
1983 1946
1984 : : "r" (src), "r" ((long)stride) 1947 : : "r" (src), "r" ((long)stride)
1985 : "%"REG_a, "%"REG_d 1948 : "%"REG_a, "%"REG_d
1986 ); 1949 );
1987 1950
1988 #else // MMX without MMX2 1951 #else // MMX without MMX2
1989 asm volatile( 1952 asm volatile(
1990 "lea (%0, %1), %%"REG_a" \n\t" 1953 "lea (%0, %1), %%"REG_a" \n\t"
1991 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1954 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1992 // 0 1 2 3 4 5 6 7 8 9 1955 // 0 1 2 3 4 5 6 7 8 9
1993 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1956 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1994 "pxor %%mm7, %%mm7 \n\t" 1957 "pxor %%mm7, %%mm7 \n\t"
1995 1958
1996 #define REAL_MEDIAN(a,b,c)\ 1959 #define REAL_MEDIAN(a,b,c)\
1997 "movq " #a ", %%mm0 \n\t"\ 1960 "movq " #a ", %%mm0 \n\t"\
1998 "movq " #b ", %%mm2 \n\t"\ 1961 "movq " #b ", %%mm2 \n\t"\
1999 "movq " #c ", %%mm1 \n\t"\ 1962 "movq " #c ", %%mm1 \n\t"\
2000 "movq %%mm0, %%mm3 \n\t"\ 1963 "movq %%mm0, %%mm3 \n\t"\
2001 "movq %%mm1, %%mm4 \n\t"\ 1964 "movq %%mm1, %%mm4 \n\t"\
2002 "movq %%mm2, %%mm5 \n\t"\ 1965 "movq %%mm2, %%mm5 \n\t"\
2003 "psubusb %%mm1, %%mm3 \n\t"\ 1966 "psubusb %%mm1, %%mm3 \n\t"\
2004 "psubusb %%mm2, %%mm4 \n\t"\ 1967 "psubusb %%mm2, %%mm4 \n\t"\
2005 "psubusb %%mm0, %%mm5 \n\t"\ 1968 "psubusb %%mm0, %%mm5 \n\t"\
2006 "pcmpeqb %%mm7, %%mm3 \n\t"\ 1969 "pcmpeqb %%mm7, %%mm3 \n\t"\
2007 "pcmpeqb %%mm7, %%mm4 \n\t"\ 1970 "pcmpeqb %%mm7, %%mm4 \n\t"\
2008 "pcmpeqb %%mm7, %%mm5 \n\t"\ 1971 "pcmpeqb %%mm7, %%mm5 \n\t"\
2009 "movq %%mm3, %%mm6 \n\t"\ 1972 "movq %%mm3, %%mm6 \n\t"\
2010 "pxor %%mm4, %%mm3 \n\t"\ 1973 "pxor %%mm4, %%mm3 \n\t"\
2011 "pxor %%mm5, %%mm4 \n\t"\ 1974 "pxor %%mm5, %%mm4 \n\t"\
2012 "pxor %%mm6, %%mm5 \n\t"\ 1975 "pxor %%mm6, %%mm5 \n\t"\
2013 "por %%mm3, %%mm1 \n\t"\ 1976 "por %%mm3, %%mm1 \n\t"\
2014 "por %%mm4, %%mm2 \n\t"\ 1977 "por %%mm4, %%mm2 \n\t"\
2015 "por %%mm5, %%mm0 \n\t"\ 1978 "por %%mm5, %%mm0 \n\t"\
2016 "pand %%mm2, %%mm0 \n\t"\ 1979 "pand %%mm2, %%mm0 \n\t"\
2017 "pand %%mm1, %%mm0 \n\t"\ 1980 "pand %%mm1, %%mm0 \n\t"\
2018 "movq %%mm0, " #b " \n\t" 1981 "movq %%mm0, " #b " \n\t"
2019 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) 1982 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
2020 1983
2021 MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) 1984 MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
2022 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) 1985 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
2023 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) 1986 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
2024 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) 1987 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
2025 1988
2026 : : "r" (src), "r" ((long)stride) 1989 : : "r" (src), "r" ((long)stride)
2027 : "%"REG_a, "%"REG_d 1990 : "%"REG_a, "%"REG_d
2028 ); 1991 );
2029 #endif //HAVE_MMX2 1992 #endif //HAVE_MMX2
2030 #else //HAVE_MMX 1993 #else //HAVE_MMX
2031 int x, y; 1994 int x, y;
2032 src+= 4*stride; 1995 src+= 4*stride;
2033 // FIXME - there should be a way to do a few columns in parallel like w/mmx 1996 // FIXME - there should be a way to do a few columns in parallel like w/mmx
2034 for(x=0; x<8; x++) 1997 for(x=0; x<8; x++){
2035 { 1998 uint8_t *colsrc = src;
2036 uint8_t *colsrc = src; 1999 for (y=0; y<4; y++){
2037 for (y=0; y<4; y++) 2000 int a, b, c, d, e, f;
2038 { 2001 a = colsrc[0 ];
2039 int a, b, c, d, e, f; 2002 b = colsrc[stride ];
2040 a = colsrc[0 ]; 2003 c = colsrc[stride*2];
2041 b = colsrc[stride ]; 2004 d = (a-b)>>31;
2042 c = colsrc[stride*2]; 2005 e = (b-c)>>31;
2043 d = (a-b)>>31; 2006 f = (c-a)>>31;
2044 e = (b-c)>>31; 2007 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2045 f = (c-a)>>31; 2008 colsrc += stride*2;
2046 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2047 colsrc += stride*2;
2048 }
2049 src++;
2050 } 2009 }
2010 src++;
2011 }
2051 #endif //HAVE_MMX 2012 #endif //HAVE_MMX
2052 } 2013 }
2053 2014
2054 #ifdef HAVE_MMX 2015 #ifdef HAVE_MMX
2055 /** 2016 /**
2056 * transposes and shift the given 8x8 Block into dst1 and dst2 2017 * transposes and shift the given 8x8 Block into dst1 and dst2
2057 */ 2018 */
2058 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 2019 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2059 { 2020 {
2060 asm( 2021 asm(
2061 "lea (%0, %1), %%"REG_a" \n\t" 2022 "lea (%0, %1), %%"REG_a" \n\t"
2062 // 0 1 2 3 4 5 6 7 8 9 2023 // 0 1 2 3 4 5 6 7 8 9
2063 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2024 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2064 "movq (%0), %%mm0 \n\t" // 12345678 2025 "movq (%0), %%mm0 \n\t" // 12345678
2065 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 2026 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
2066 "movq %%mm0, %%mm2 \n\t" // 12345678 2027 "movq %%mm0, %%mm2 \n\t" // 12345678
2067 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2028 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2068 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2029 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2069 2030
2070 "movq (%%"REG_a", %1), %%mm1 \n\t" 2031 "movq (%%"REG_a", %1), %%mm1 \n\t"
2071 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 2032 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2072 "movq %%mm1, %%mm4 \n\t" 2033 "movq %%mm1, %%mm4 \n\t"
2073 "punpcklbw %%mm3, %%mm1 \n\t" 2034 "punpcklbw %%mm3, %%mm1 \n\t"
2074 "punpckhbw %%mm3, %%mm4 \n\t" 2035 "punpckhbw %%mm3, %%mm4 \n\t"
2075 2036
2076 "movq %%mm0, %%mm3 \n\t" 2037 "movq %%mm0, %%mm3 \n\t"
2077 "punpcklwd %%mm1, %%mm0 \n\t" 2038 "punpcklwd %%mm1, %%mm0 \n\t"
2078 "punpckhwd %%mm1, %%mm3 \n\t" 2039 "punpckhwd %%mm1, %%mm3 \n\t"
2079 "movq %%mm2, %%mm1 \n\t" 2040 "movq %%mm2, %%mm1 \n\t"
2080 "punpcklwd %%mm4, %%mm2 \n\t" 2041 "punpcklwd %%mm4, %%mm2 \n\t"
2081 "punpckhwd %%mm4, %%mm1 \n\t" 2042 "punpckhwd %%mm4, %%mm1 \n\t"
2082 2043
2083 "movd %%mm0, 128(%2) \n\t" 2044 "movd %%mm0, 128(%2) \n\t"
2084 "psrlq $32, %%mm0 \n\t" 2045 "psrlq $32, %%mm0 \n\t"
2085 "movd %%mm0, 144(%2) \n\t" 2046 "movd %%mm0, 144(%2) \n\t"
2086 "movd %%mm3, 160(%2) \n\t" 2047 "movd %%mm3, 160(%2) \n\t"
2087 "psrlq $32, %%mm3 \n\t" 2048 "psrlq $32, %%mm3 \n\t"
2088 "movd %%mm3, 176(%2) \n\t" 2049 "movd %%mm3, 176(%2) \n\t"
2089 "movd %%mm3, 48(%3) \n\t" 2050 "movd %%mm3, 48(%3) \n\t"
2090 "movd %%mm2, 192(%2) \n\t" 2051 "movd %%mm2, 192(%2) \n\t"
2091 "movd %%mm2, 64(%3) \n\t" 2052 "movd %%mm2, 64(%3) \n\t"
2092 "psrlq $32, %%mm2 \n\t" 2053 "psrlq $32, %%mm2 \n\t"
2093 "movd %%mm2, 80(%3) \n\t" 2054 "movd %%mm2, 80(%3) \n\t"
2094 "movd %%mm1, 96(%3) \n\t" 2055 "movd %%mm1, 96(%3) \n\t"
2095 "psrlq $32, %%mm1 \n\t" 2056 "psrlq $32, %%mm1 \n\t"
2096 "movd %%mm1, 112(%3) \n\t" 2057 "movd %%mm1, 112(%3) \n\t"
2097 2058
2098 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" 2059 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"
2099 2060
2100 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 2061 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2101 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 2062 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh
2102 "movq %%mm0, %%mm2 \n\t" // 12345678 2063 "movq %%mm0, %%mm2 \n\t" // 12345678
2103 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2064 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2104 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2065 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2105 2066
2106 "movq (%%"REG_a", %1), %%mm1 \n\t" 2067 "movq (%%"REG_a", %1), %%mm1 \n\t"
2107 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 2068 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
2108 "movq %%mm1, %%mm4 \n\t" 2069 "movq %%mm1, %%mm4 \n\t"
2109 "punpcklbw %%mm3, %%mm1 \n\t" 2070 "punpcklbw %%mm3, %%mm1 \n\t"
2110 "punpckhbw %%mm3, %%mm4 \n\t" 2071 "punpckhbw %%mm3, %%mm4 \n\t"
2111 2072
2112 "movq %%mm0, %%mm3 \n\t" 2073 "movq %%mm0, %%mm3 \n\t"
2113 "punpcklwd %%mm1, %%mm0 \n\t" 2074 "punpcklwd %%mm1, %%mm0 \n\t"
2114 "punpckhwd %%mm1, %%mm3 \n\t" 2075 "punpckhwd %%mm1, %%mm3 \n\t"
2115 "movq %%mm2, %%mm1 \n\t" 2076 "movq %%mm2, %%mm1 \n\t"
2116 "punpcklwd %%mm4, %%mm2 \n\t" 2077 "punpcklwd %%mm4, %%mm2 \n\t"
2117 "punpckhwd %%mm4, %%mm1 \n\t" 2078 "punpckhwd %%mm4, %%mm1 \n\t"
2118 2079
2119 "movd %%mm0, 132(%2) \n\t" 2080 "movd %%mm0, 132(%2) \n\t"
2120 "psrlq $32, %%mm0 \n\t" 2081 "psrlq $32, %%mm0 \n\t"
2121 "movd %%mm0, 148(%2) \n\t" 2082 "movd %%mm0, 148(%2) \n\t"
2122 "movd %%mm3, 164(%2) \n\t" 2083 "movd %%mm3, 164(%2) \n\t"
2123 "psrlq $32, %%mm3 \n\t" 2084 "psrlq $32, %%mm3 \n\t"
2124 "movd %%mm3, 180(%2) \n\t" 2085 "movd %%mm3, 180(%2) \n\t"
2125 "movd %%mm3, 52(%3) \n\t" 2086 "movd %%mm3, 52(%3) \n\t"
2126 "movd %%mm2, 196(%2) \n\t" 2087 "movd %%mm2, 196(%2) \n\t"
2127 "movd %%mm2, 68(%3) \n\t" 2088 "movd %%mm2, 68(%3) \n\t"
2128 "psrlq $32, %%mm2 \n\t" 2089 "psrlq $32, %%mm2 \n\t"
2129 "movd %%mm2, 84(%3) \n\t" 2090 "movd %%mm2, 84(%3) \n\t"
2130 "movd %%mm1, 100(%3) \n\t" 2091 "movd %%mm1, 100(%3) \n\t"
2131 "psrlq $32, %%mm1 \n\t" 2092 "psrlq $32, %%mm1 \n\t"
2132 "movd %%mm1, 116(%3) \n\t" 2093 "movd %%mm1, 116(%3) \n\t"
2133 2094
2134 2095
2135 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2) 2096 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
2136 : "%"REG_a 2097 : "%"REG_a
2137 ); 2098 );
2138 } 2099 }
2139 2100
2140 /** 2101 /**
2141 * transposes the given 8x8 block 2102 * transposes the given 8x8 block
2142 */ 2103 */
2143 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) 2104 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2144 { 2105 {
2145 asm( 2106 asm(
2146 "lea (%0, %1), %%"REG_a" \n\t" 2107 "lea (%0, %1), %%"REG_a" \n\t"
2147 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t" 2108 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
2148 // 0 1 2 3 4 5 6 7 8 9 2109 // 0 1 2 3 4 5 6 7 8 9
2149 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 2110 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2150 "movq (%2), %%mm0 \n\t" // 12345678 2111 "movq (%2), %%mm0 \n\t" // 12345678
2151 "movq 16(%2), %%mm1 \n\t" // abcdefgh 2112 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2152 "movq %%mm0, %%mm2 \n\t" // 12345678 2113 "movq %%mm0, %%mm2 \n\t" // 12345678
2153 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2114 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2154 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2115 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2155 2116
2156 "movq 32(%2), %%mm1 \n\t" 2117 "movq 32(%2), %%mm1 \n\t"
2157 "movq 48(%2), %%mm3 \n\t" 2118 "movq 48(%2), %%mm3 \n\t"
2158 "movq %%mm1, %%mm4 \n\t" 2119 "movq %%mm1, %%mm4 \n\t"
2159 "punpcklbw %%mm3, %%mm1 \n\t" 2120 "punpcklbw %%mm3, %%mm1 \n\t"
2160 "punpckhbw %%mm3, %%mm4 \n\t" 2121 "punpckhbw %%mm3, %%mm4 \n\t"
2161 2122
2162 "movq %%mm0, %%mm3 \n\t" 2123 "movq %%mm0, %%mm3 \n\t"
2163 "punpcklwd %%mm1, %%mm0 \n\t" 2124 "punpcklwd %%mm1, %%mm0 \n\t"
2164 "punpckhwd %%mm1, %%mm3 \n\t" 2125 "punpckhwd %%mm1, %%mm3 \n\t"
2165 "movq %%mm2, %%mm1 \n\t" 2126 "movq %%mm2, %%mm1 \n\t"
2166 "punpcklwd %%mm4, %%mm2 \n\t" 2127 "punpcklwd %%mm4, %%mm2 \n\t"
2167 "punpckhwd %%mm4, %%mm1 \n\t" 2128 "punpckhwd %%mm4, %%mm1 \n\t"
2168 2129
2169 "movd %%mm0, (%0) \n\t" 2130 "movd %%mm0, (%0) \n\t"
2170 "psrlq $32, %%mm0 \n\t" 2131 "psrlq $32, %%mm0 \n\t"
2171 "movd %%mm0, (%%"REG_a") \n\t" 2132 "movd %%mm0, (%%"REG_a") \n\t"
2172 "movd %%mm3, (%%"REG_a", %1) \n\t" 2133 "movd %%mm3, (%%"REG_a", %1) \n\t"
2173 "psrlq $32, %%mm3 \n\t" 2134 "psrlq $32, %%mm3 \n\t"
2174 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" 2135 "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
2175 "movd %%mm2, (%0, %1, 4) \n\t" 2136 "movd %%mm2, (%0, %1, 4) \n\t"
2176 "psrlq $32, %%mm2 \n\t" 2137 "psrlq $32, %%mm2 \n\t"
2177 "movd %%mm2, (%%"REG_d") \n\t" 2138 "movd %%mm2, (%%"REG_d") \n\t"
2178 "movd %%mm1, (%%"REG_d", %1) \n\t" 2139 "movd %%mm1, (%%"REG_d", %1) \n\t"
2179 "psrlq $32, %%mm1 \n\t" 2140 "psrlq $32, %%mm1 \n\t"
2180 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" 2141 "movd %%mm1, (%%"REG_d", %1, 2) \n\t"
2181 2142
2182 2143
2183 "movq 64(%2), %%mm0 \n\t" // 12345678 2144 "movq 64(%2), %%mm0 \n\t" // 12345678
2184 "movq 80(%2), %%mm1 \n\t" // abcdefgh 2145 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2185 "movq %%mm0, %%mm2 \n\t" // 12345678 2146 "movq %%mm0, %%mm2 \n\t" // 12345678
2186 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 2147 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2187 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 2148 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2188 2149
2189 "movq 96(%2), %%mm1 \n\t" 2150 "movq 96(%2), %%mm1 \n\t"
2190 "movq 112(%2), %%mm3 \n\t" 2151 "movq 112(%2), %%mm3 \n\t"
2191 "movq %%mm1, %%mm4 \n\t" 2152 "movq %%mm1, %%mm4 \n\t"
2192 "punpcklbw %%mm3, %%mm1 \n\t" 2153 "punpcklbw %%mm3, %%mm1 \n\t"
2193 "punpckhbw %%mm3, %%mm4 \n\t" 2154 "punpckhbw %%mm3, %%mm4 \n\t"
2194 2155
2195 "movq %%mm0, %%mm3 \n\t" 2156 "movq %%mm0, %%mm3 \n\t"
2196 "punpcklwd %%mm1, %%mm0 \n\t" 2157 "punpcklwd %%mm1, %%mm0 \n\t"
2197 "punpckhwd %%mm1, %%mm3 \n\t" 2158 "punpckhwd %%mm1, %%mm3 \n\t"
2198 "movq %%mm2, %%mm1 \n\t" 2159 "movq %%mm2, %%mm1 \n\t"
2199 "punpcklwd %%mm4, %%mm2 \n\t" 2160 "punpcklwd %%mm4, %%mm2 \n\t"
2200 "punpckhwd %%mm4, %%mm1 \n\t" 2161 "punpckhwd %%mm4, %%mm1 \n\t"
2201 2162
2202 "movd %%mm0, 4(%0) \n\t" 2163 "movd %%mm0, 4(%0) \n\t"
2203 "psrlq $32, %%mm0 \n\t" 2164 "psrlq $32, %%mm0 \n\t"
2204 "movd %%mm0, 4(%%"REG_a") \n\t" 2165 "movd %%mm0, 4(%%"REG_a") \n\t"
2205 "movd %%mm3, 4(%%"REG_a", %1) \n\t" 2166 "movd %%mm3, 4(%%"REG_a", %1) \n\t"
2206 "psrlq $32, %%mm3 \n\t" 2167 "psrlq $32, %%mm3 \n\t"
2207 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" 2168 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
2208 "movd %%mm2, 4(%0, %1, 4) \n\t" 2169 "movd %%mm2, 4(%0, %1, 4) \n\t"
2209 "psrlq $32, %%mm2 \n\t" 2170 "psrlq $32, %%mm2 \n\t"
2210 "movd %%mm2, 4(%%"REG_d") \n\t" 2171 "movd %%mm2, 4(%%"REG_d") \n\t"
2211 "movd %%mm1, 4(%%"REG_d", %1) \n\t" 2172 "movd %%mm1, 4(%%"REG_d", %1) \n\t"
2212 "psrlq $32, %%mm1 \n\t" 2173 "psrlq $32, %%mm1 \n\t"
2213 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" 2174 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"
2214 2175
2215 :: "r" (dst), "r" ((long)dstStride), "r" (src) 2176 :: "r" (dst), "r" ((long)dstStride), "r" (src)
2216 : "%"REG_a, "%"REG_d 2177 : "%"REG_a, "%"REG_d
2217 ); 2178 );
2218 } 2179 }
2219 #endif //HAVE_MMX 2180 #endif //HAVE_MMX
2220 //static long test=0; 2181 //static long test=0;
2221 2182
2222 #ifndef HAVE_ALTIVEC 2183 #ifndef HAVE_ALTIVEC
2223 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 2184 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2224 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) 2185 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2225 { 2186 {
2226 // to save a register (FIXME do this outside of the loops) 2187 // to save a register (FIXME do this outside of the loops)
2227 tempBluredPast[127]= maxNoise[0]; 2188 tempBluredPast[127]= maxNoise[0];
2228 tempBluredPast[128]= maxNoise[1]; 2189 tempBluredPast[128]= maxNoise[1];
2229 tempBluredPast[129]= maxNoise[2]; 2190 tempBluredPast[129]= maxNoise[2];
2230 2191
2231 #define FAST_L2_DIFF 2192 #define FAST_L2_DIFF
2232 //#define L1_DIFF //u should change the thresholds too if u try that one 2193 //#define L1_DIFF //u should change the thresholds too if u try that one
2233 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2194 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2234 asm volatile( 2195 asm volatile(
2235 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride 2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2236 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride 2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2237 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2238 // 0 1 2 3 4 5 6 7 8 9 2199 // 0 1 2 3 4 5 6 7 8 9
2239 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 2200 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2240 //FIXME reorder? 2201 //FIXME reorder?
2241 #ifdef L1_DIFF //needs mmx2 2202 #ifdef L1_DIFF //needs mmx2
2242 "movq (%0), %%mm0 \n\t" // L0 2203 "movq (%0), %%mm0 \n\t" // L0
2243 "psadbw (%1), %%mm0 \n\t" // |L0-R0| 2204 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2244 "movq (%0, %2), %%mm1 \n\t" // L1 2205 "movq (%0, %2), %%mm1 \n\t" // L1
2245 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| 2206 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2246 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2207 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2247 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| 2208 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2248 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2209 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2249 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| 2210 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3|
2250 2211
2251 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2212 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2252 "paddw %%mm1, %%mm0 \n\t" 2213 "paddw %%mm1, %%mm0 \n\t"
2253 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| 2214 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2254 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2215 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2255 "paddw %%mm2, %%mm0 \n\t" 2216 "paddw %%mm2, %%mm0 \n\t"
2256 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| 2217 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5|
2257 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2218 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2258 "paddw %%mm3, %%mm0 \n\t" 2219 "paddw %%mm3, %%mm0 \n\t"
2259 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| 2220 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6|
2260 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2221 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2261 "paddw %%mm4, %%mm0 \n\t" 2222 "paddw %%mm4, %%mm0 \n\t"
2262 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| 2223 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7|
2263 "paddw %%mm5, %%mm6 \n\t" 2224 "paddw %%mm5, %%mm6 \n\t"
2264 "paddw %%mm7, %%mm6 \n\t" 2225 "paddw %%mm7, %%mm6 \n\t"
2265 "paddw %%mm6, %%mm0 \n\t" 2226 "paddw %%mm6, %%mm0 \n\t"
2266 #else //L1_DIFF 2227 #else //L1_DIFF
2267 #if defined (FAST_L2_DIFF) 2228 #if defined (FAST_L2_DIFF)
2268 "pcmpeqb %%mm7, %%mm7 \n\t" 2229 "pcmpeqb %%mm7, %%mm7 \n\t"
2269 "movq "MANGLE(b80)", %%mm6 \n\t" 2230 "movq "MANGLE(b80)", %%mm6 \n\t"
2270 "pxor %%mm0, %%mm0 \n\t" 2231 "pxor %%mm0, %%mm0 \n\t"
2271 #define REAL_L2_DIFF_CORE(a, b)\ 2232 #define REAL_L2_DIFF_CORE(a, b)\
2272 "movq " #a ", %%mm5 \n\t"\ 2233 "movq " #a ", %%mm5 \n\t"\
2273 "movq " #b ", %%mm2 \n\t"\ 2234 "movq " #b ", %%mm2 \n\t"\
2274 "pxor %%mm7, %%mm2 \n\t"\ 2235 "pxor %%mm7, %%mm2 \n\t"\
2275 PAVGB(%%mm2, %%mm5)\ 2236 PAVGB(%%mm2, %%mm5)\
2276 "paddb %%mm6, %%mm5 \n\t"\ 2237 "paddb %%mm6, %%mm5 \n\t"\
2277 "movq %%mm5, %%mm2 \n\t"\ 2238 "movq %%mm5, %%mm2 \n\t"\
2278 "psllw $8, %%mm5 \n\t"\ 2239 "psllw $8, %%mm5 \n\t"\
2279 "pmaddwd %%mm5, %%mm5 \n\t"\ 2240 "pmaddwd %%mm5, %%mm5 \n\t"\
2280 "pmaddwd %%mm2, %%mm2 \n\t"\ 2241 "pmaddwd %%mm2, %%mm2 \n\t"\
2281 "paddd %%mm2, %%mm5 \n\t"\ 2242 "paddd %%mm2, %%mm5 \n\t"\
2282 "psrld $14, %%mm5 \n\t"\ 2243 "psrld $14, %%mm5 \n\t"\
2283 "paddd %%mm5, %%mm0 \n\t" 2244 "paddd %%mm5, %%mm0 \n\t"
2284 2245
2285 #else //defined (FAST_L2_DIFF) 2246 #else //defined (FAST_L2_DIFF)
2286 "pxor %%mm7, %%mm7 \n\t" 2247 "pxor %%mm7, %%mm7 \n\t"
2287 "pxor %%mm0, %%mm0 \n\t" 2248 "pxor %%mm0, %%mm0 \n\t"
2288 #define REAL_L2_DIFF_CORE(a, b)\ 2249 #define REAL_L2_DIFF_CORE(a, b)\
2289 "movq " #a ", %%mm5 \n\t"\ 2250 "movq " #a ", %%mm5 \n\t"\
2290 "movq " #b ", %%mm2 \n\t"\ 2251 "movq " #b ", %%mm2 \n\t"\
2291 "movq %%mm5, %%mm1 \n\t"\ 2252 "movq %%mm5, %%mm1 \n\t"\
2292 "movq %%mm2, %%mm3 \n\t"\ 2253 "movq %%mm2, %%mm3 \n\t"\
2293 "punpcklbw %%mm7, %%mm5 \n\t"\ 2254 "punpcklbw %%mm7, %%mm5 \n\t"\
2294 "punpckhbw %%mm7, %%mm1 \n\t"\ 2255 "punpckhbw %%mm7, %%mm1 \n\t"\
2295 "punpcklbw %%mm7, %%mm2 \n\t"\ 2256 "punpcklbw %%mm7, %%mm2 \n\t"\
2296 "punpckhbw %%mm7, %%mm3 \n\t"\ 2257 "punpckhbw %%mm7, %%mm3 \n\t"\
2297 "psubw %%mm2, %%mm5 \n\t"\ 2258 "psubw %%mm2, %%mm5 \n\t"\
2298 "psubw %%mm3, %%mm1 \n\t"\ 2259 "psubw %%mm3, %%mm1 \n\t"\
2299 "pmaddwd %%mm5, %%mm5 \n\t"\ 2260 "pmaddwd %%mm5, %%mm5 \n\t"\
2300 "pmaddwd %%mm1, %%mm1 \n\t"\ 2261 "pmaddwd %%mm1, %%mm1 \n\t"\
2301 "paddd %%mm1, %%mm5 \n\t"\ 2262 "paddd %%mm1, %%mm5 \n\t"\
2302 "paddd %%mm5, %%mm0 \n\t" 2263 "paddd %%mm5, %%mm0 \n\t"
2303 2264
2304 #endif //defined (FAST_L2_DIFF) 2265 #endif //defined (FAST_L2_DIFF)
2305 2266
2306 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) 2267 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2307 2268
2314 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) 2275 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2315 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc)) 2276 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
2316 2277
2317 #endif //L1_DIFF 2278 #endif //L1_DIFF
2318 2279
2319 "movq %%mm0, %%mm4 \n\t" 2280 "movq %%mm0, %%mm4 \n\t"
2320 "psrlq $32, %%mm0 \n\t" 2281 "psrlq $32, %%mm0 \n\t"
2321 "paddd %%mm0, %%mm4 \n\t" 2282 "paddd %%mm0, %%mm4 \n\t"
2322 "movd %%mm4, %%ecx \n\t" 2283 "movd %%mm4, %%ecx \n\t"
2323 "shll $2, %%ecx \n\t" 2284 "shll $2, %%ecx \n\t"
2324 "mov %3, %%"REG_d" \n\t" 2285 "mov %3, %%"REG_d" \n\t"
2325 "addl -4(%%"REG_d"), %%ecx \n\t" 2286 "addl -4(%%"REG_d"), %%ecx \n\t"
2326 "addl 4(%%"REG_d"), %%ecx \n\t" 2287 "addl 4(%%"REG_d"), %%ecx \n\t"
2327 "addl -1024(%%"REG_d"), %%ecx \n\t" 2288 "addl -1024(%%"REG_d"), %%ecx \n\t"
2328 "addl $4, %%ecx \n\t" 2289 "addl $4, %%ecx \n\t"
2329 "addl 1024(%%"REG_d"), %%ecx \n\t" 2290 "addl 1024(%%"REG_d"), %%ecx \n\t"
2330 "shrl $3, %%ecx \n\t" 2291 "shrl $3, %%ecx \n\t"
2331 "movl %%ecx, (%%"REG_d") \n\t" 2292 "movl %%ecx, (%%"REG_d") \n\t"
2332 2293
2333 // "mov %3, %%"REG_c" \n\t" 2294 // "mov %3, %%"REG_c" \n\t"
2334 // "mov %%"REG_c", test \n\t" 2295 // "mov %%"REG_c", test \n\t"
2335 // "jmp 4f \n\t" 2296 // "jmp 4f \n\t"
2336 "cmpl 512(%%"REG_d"), %%ecx \n\t" 2297 "cmpl 512(%%"REG_d"), %%ecx \n\t"
2337 " jb 2f \n\t" 2298 " jb 2f \n\t"
2338 "cmpl 516(%%"REG_d"), %%ecx \n\t" 2299 "cmpl 516(%%"REG_d"), %%ecx \n\t"
2339 " jb 1f \n\t" 2300 " jb 1f \n\t"
2340 2301
2341 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2302 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2342 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2303 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2343 "movq (%0), %%mm0 \n\t" // L0 2304 "movq (%0), %%mm0 \n\t" // L0
2344 "movq (%0, %2), %%mm1 \n\t" // L1 2305 "movq (%0, %2), %%mm1 \n\t" // L1
2345 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2306 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2346 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2307 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2347 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2308 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2348 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2309 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2349 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2310 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2350 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2311 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2351 "movq %%mm0, (%1) \n\t" // L0 2312 "movq %%mm0, (%1) \n\t" // L0
2352 "movq %%mm1, (%1, %2) \n\t" // L1 2313 "movq %%mm1, (%1, %2) \n\t" // L1
2353 "movq %%mm2, (%1, %2, 2) \n\t" // L2 2314 "movq %%mm2, (%1, %2, 2) \n\t" // L2
2354 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 2315 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3
2355 "movq %%mm4, (%1, %2, 4) \n\t" // L4 2316 "movq %%mm4, (%1, %2, 4) \n\t" // L4
2356 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 2317 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5
2357 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 2318 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
2358 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 2319 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7
2359 "jmp 4f \n\t" 2320 "jmp 4f \n\t"
2360 2321
2361 "1: \n\t" 2322 "1: \n\t"
2362 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2323 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2363 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2324 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2364 "movq (%0), %%mm0 \n\t" // L0 2325 "movq (%0), %%mm0 \n\t" // L0
2365 PAVGB((%1), %%mm0) // L0 2326 PAVGB((%1), %%mm0) // L0
2366 "movq (%0, %2), %%mm1 \n\t" // L1 2327 "movq (%0, %2), %%mm1 \n\t" // L1
2367 PAVGB((%1, %2), %%mm1) // L1 2328 PAVGB((%1, %2), %%mm1) // L1
2368 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2329 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2369 PAVGB((%1, %2, 2), %%mm2) // L2 2330 PAVGB((%1, %2, 2), %%mm2) // L2
2370 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2331 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2371 PAVGB((%1, %%REGa), %%mm3) // L3 2332 PAVGB((%1, %%REGa), %%mm3) // L3
2372 "movq (%0, %2, 4), %%mm4 \n\t" // L4 2333 "movq (%0, %2, 4), %%mm4 \n\t" // L4
2373 PAVGB((%1, %2, 4), %%mm4) // L4 2334 PAVGB((%1, %2, 4), %%mm4) // L4
2374 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 2335 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2375 PAVGB((%1, %%REGd), %%mm5) // L5 2336 PAVGB((%1, %%REGd), %%mm5) // L5
2376 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 2337 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2377 PAVGB((%1, %%REGa, 2), %%mm6) // L6 2338 PAVGB((%1, %%REGa, 2), %%mm6) // L6
2378 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 2339 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2379 PAVGB((%1, %%REGc), %%mm7) // L7 2340 PAVGB((%1, %%REGc), %%mm7) // L7
2380 "movq %%mm0, (%1) \n\t" // R0 2341 "movq %%mm0, (%1) \n\t" // R0
2381 "movq %%mm1, (%1, %2) \n\t" // R1 2342 "movq %%mm1, (%1, %2) \n\t" // R1
2382 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2343 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2383 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2344 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2384 "movq %%mm4, (%1, %2, 4) \n\t" // R4 2345 "movq %%mm4, (%1, %2, 4) \n\t" // R4
2385 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 2346 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5
2386 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 2347 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
2387 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 2348 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7
2388 "movq %%mm0, (%0) \n\t" // L0 2349 "movq %%mm0, (%0) \n\t" // L0
2389 "movq %%mm1, (%0, %2) \n\t" // L1 2350 "movq %%mm1, (%0, %2) \n\t" // L1
2390 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2351 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2391 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2352 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2392 "movq %%mm4, (%0, %2, 4) \n\t" // L4 2353 "movq %%mm4, (%0, %2, 4) \n\t" // L4
2393 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 2354 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5
2394 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 2355 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
2395 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 2356 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7
2396 "jmp 4f \n\t" 2357 "jmp 4f \n\t"
2397 2358
2398 "2: \n\t" 2359 "2: \n\t"
2399 "cmpl 508(%%"REG_d"), %%ecx \n\t" 2360 "cmpl 508(%%"REG_d"), %%ecx \n\t"
2400 " jb 3f \n\t" 2361 " jb 3f \n\t"
2401 2362
2402 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2363 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2403 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2364 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2404 "movq (%0), %%mm0 \n\t" // L0 2365 "movq (%0), %%mm0 \n\t" // L0
2405 "movq (%0, %2), %%mm1 \n\t" // L1 2366 "movq (%0, %2), %%mm1 \n\t" // L1
2406 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2367 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2407 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2368 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2408 "movq (%1), %%mm4 \n\t" // R0 2369 "movq (%1), %%mm4 \n\t" // R0
2409 "movq (%1, %2), %%mm5 \n\t" // R1 2370 "movq (%1, %2), %%mm5 \n\t" // R1
2410 "movq (%1, %2, 2), %%mm6 \n\t" // R2 2371 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2411 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2372 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2412 PAVGB(%%mm4, %%mm0) 2373 PAVGB(%%mm4, %%mm0)
2413 PAVGB(%%mm5, %%mm1) 2374 PAVGB(%%mm5, %%mm1)
2414 PAVGB(%%mm6, %%mm2) 2375 PAVGB(%%mm6, %%mm2)
2415 PAVGB(%%mm7, %%mm3) 2376 PAVGB(%%mm7, %%mm3)
2416 PAVGB(%%mm4, %%mm0) 2377 PAVGB(%%mm4, %%mm0)
2417 PAVGB(%%mm5, %%mm1) 2378 PAVGB(%%mm5, %%mm1)
2418 PAVGB(%%mm6, %%mm2) 2379 PAVGB(%%mm6, %%mm2)
2419 PAVGB(%%mm7, %%mm3) 2380 PAVGB(%%mm7, %%mm3)
2420 "movq %%mm0, (%1) \n\t" // R0 2381 "movq %%mm0, (%1) \n\t" // R0
2421 "movq %%mm1, (%1, %2) \n\t" // R1 2382 "movq %%mm1, (%1, %2) \n\t" // R1
2422 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2383 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2423 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2384 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2424 "movq %%mm0, (%0) \n\t" // L0 2385 "movq %%mm0, (%0) \n\t" // L0
2425 "movq %%mm1, (%0, %2) \n\t" // L1 2386 "movq %%mm1, (%0, %2) \n\t" // L1
2426 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2387 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2427 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2388 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2428 2389
2429 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2390 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2430 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2391 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2431 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2392 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2432 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2393 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2433 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2394 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2434 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2395 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2435 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2396 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2436 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2397 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2437 PAVGB(%%mm4, %%mm0) 2398 PAVGB(%%mm4, %%mm0)
2438 PAVGB(%%mm5, %%mm1) 2399 PAVGB(%%mm5, %%mm1)
2439 PAVGB(%%mm6, %%mm2) 2400 PAVGB(%%mm6, %%mm2)
2440 PAVGB(%%mm7, %%mm3) 2401 PAVGB(%%mm7, %%mm3)
2441 PAVGB(%%mm4, %%mm0) 2402 PAVGB(%%mm4, %%mm0)
2442 PAVGB(%%mm5, %%mm1) 2403 PAVGB(%%mm5, %%mm1)
2443 PAVGB(%%mm6, %%mm2) 2404 PAVGB(%%mm6, %%mm2)
2444 PAVGB(%%mm7, %%mm3) 2405 PAVGB(%%mm7, %%mm3)
2445 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2406 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2446 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2407 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2447 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2408 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2448 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2409 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2449 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2410 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2450 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2411 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2451 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2412 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2452 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2413 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2453 "jmp 4f \n\t" 2414 "jmp 4f \n\t"
2454 2415
2455 "3: \n\t" 2416 "3: \n\t"
2456 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 2417 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2457 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2418 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2458 "movq (%0), %%mm0 \n\t" // L0 2419 "movq (%0), %%mm0 \n\t" // L0
2459 "movq (%0, %2), %%mm1 \n\t" // L1 2420 "movq (%0, %2), %%mm1 \n\t" // L1
2460 "movq (%0, %2, 2), %%mm2 \n\t" // L2 2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2
2461 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 2422 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2462 "movq (%1), %%mm4 \n\t" // R0 2423 "movq (%1), %%mm4 \n\t" // R0
2463 "movq (%1, %2), %%mm5 \n\t" // R1 2424 "movq (%1, %2), %%mm5 \n\t" // R1
2464 "movq (%1, %2, 2), %%mm6 \n\t" // R2 2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2
2465 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 2426 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2466 PAVGB(%%mm4, %%mm0) 2427 PAVGB(%%mm4, %%mm0)
2467 PAVGB(%%mm5, %%mm1) 2428 PAVGB(%%mm5, %%mm1)
2468 PAVGB(%%mm6, %%mm2) 2429 PAVGB(%%mm6, %%mm2)
2469 PAVGB(%%mm7, %%mm3) 2430 PAVGB(%%mm7, %%mm3)
2470 PAVGB(%%mm4, %%mm0) 2431 PAVGB(%%mm4, %%mm0)
2471 PAVGB(%%mm5, %%mm1) 2432 PAVGB(%%mm5, %%mm1)
2472 PAVGB(%%mm6, %%mm2) 2433 PAVGB(%%mm6, %%mm2)
2473 PAVGB(%%mm7, %%mm3) 2434 PAVGB(%%mm7, %%mm3)
2474 PAVGB(%%mm4, %%mm0) 2435 PAVGB(%%mm4, %%mm0)
2475 PAVGB(%%mm5, %%mm1) 2436 PAVGB(%%mm5, %%mm1)
2476 PAVGB(%%mm6, %%mm2) 2437 PAVGB(%%mm6, %%mm2)
2477 PAVGB(%%mm7, %%mm3) 2438 PAVGB(%%mm7, %%mm3)
2478 "movq %%mm0, (%1) \n\t" // R0 2439 "movq %%mm0, (%1) \n\t" // R0
2479 "movq %%mm1, (%1, %2) \n\t" // R1 2440 "movq %%mm1, (%1, %2) \n\t" // R1
2480 "movq %%mm2, (%1, %2, 2) \n\t" // R2 2441 "movq %%mm2, (%1, %2, 2) \n\t" // R2
2481 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 2442 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2482 "movq %%mm0, (%0) \n\t" // L0 2443 "movq %%mm0, (%0) \n\t" // L0
2483 "movq %%mm1, (%0, %2) \n\t" // L1 2444 "movq %%mm1, (%0, %2) \n\t" // L1
2484 "movq %%mm2, (%0, %2, 2) \n\t" // L2 2445 "movq %%mm2, (%0, %2, 2) \n\t" // L2
2485 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 2446 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2486 2447
2487 "movq (%0, %2, 4), %%mm0 \n\t" // L4 2448 "movq (%0, %2, 4), %%mm0 \n\t" // L4
2488 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 2449 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5
2489 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 2450 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6
2490 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 2451 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7
2491 "movq (%1, %2, 4), %%mm4 \n\t" // R4 2452 "movq (%1, %2, 4), %%mm4 \n\t" // R4
2492 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 2453 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5
2493 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 2454 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6
2494 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 2455 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7
2495 PAVGB(%%mm4, %%mm0) 2456 PAVGB(%%mm4, %%mm0)
2496 PAVGB(%%mm5, %%mm1) 2457 PAVGB(%%mm5, %%mm1)
2497 PAVGB(%%mm6, %%mm2) 2458 PAVGB(%%mm6, %%mm2)
2498 PAVGB(%%mm7, %%mm3) 2459 PAVGB(%%mm7, %%mm3)
2499 PAVGB(%%mm4, %%mm0) 2460 PAVGB(%%mm4, %%mm0)
2500 PAVGB(%%mm5, %%mm1) 2461 PAVGB(%%mm5, %%mm1)
2501 PAVGB(%%mm6, %%mm2) 2462 PAVGB(%%mm6, %%mm2)
2502 PAVGB(%%mm7, %%mm3) 2463 PAVGB(%%mm7, %%mm3)
2503 PAVGB(%%mm4, %%mm0) 2464 PAVGB(%%mm4, %%mm0)
2504 PAVGB(%%mm5, %%mm1) 2465 PAVGB(%%mm5, %%mm1)
2505 PAVGB(%%mm6, %%mm2) 2466 PAVGB(%%mm6, %%mm2)
2506 PAVGB(%%mm7, %%mm3) 2467 PAVGB(%%mm7, %%mm3)
2507 "movq %%mm0, (%1, %2, 4) \n\t" // R4 2468 "movq %%mm0, (%1, %2, 4) \n\t" // R4
2508 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 2469 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5
2509 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 2470 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6
2510 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 2471 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7
2511 "movq %%mm0, (%0, %2, 4) \n\t" // L4 2472 "movq %%mm0, (%0, %2, 4) \n\t" // L4
2512 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 2473 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5
2513 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 2474 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6
2514 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 2475 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7
2515 2476
2516 "4: \n\t" 2477 "4: \n\t"
2517 2478
2518 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) 2479 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
2519 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" 2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2520 ); 2481 );
2521 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2482 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2522 { 2483 {
2523 int y; 2484 int y;
2524 int d=0; 2485 int d=0;
2525 // int sysd=0; 2486 // int sysd=0;
2526 int i; 2487 int i;
2527 2488
2528 for(y=0; y<8; y++) 2489 for(y=0; y<8; y++){
2529 { 2490 int x;
2530 int x; 2491 for(x=0; x<8; x++){
2531 for(x=0; x<8; x++) 2492 int ref= tempBlured[ x + y*stride ];
2532 { 2493 int cur= src[ x + y*stride ];
2533 int ref= tempBlured[ x + y*stride ]; 2494 int d1=ref - cur;
2534 int cur= src[ x + y*stride ]; 2495 // if(x==0 || x==7) d1+= d1>>1;
2535 int d1=ref - cur; 2496 // if(y==0 || y==7) d1+= d1>>1;
2536 // if(x==0 || x==7) d1+= d1>>1; 2497 // d+= FFABS(d1);
2537 // if(y==0 || y==7) d1+= d1>>1; 2498 d+= d1*d1;
2538 // d+= FFABS(d1); 2499 // sysd+= d1;
2539 d+= d1*d1;
2540 // sysd+= d1;
2541 }
2542 } 2500 }
2543 i=d; 2501 }
2544 d= ( 2502 i=d;
2545 4*d 2503 d= (
2546 +(*(tempBluredPast-256)) 2504 4*d
2547 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) 2505 +(*(tempBluredPast-256))
2548 +(*(tempBluredPast+256)) 2506 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2549 +4)>>3; 2507 +(*(tempBluredPast+256))
2550 *tempBluredPast=i; 2508 +4)>>3;
2551 // ((*tempBluredPast)*3 + d + 2)>>2; 2509 *tempBluredPast=i;
2510 // ((*tempBluredPast)*3 + d + 2)>>2;
2552 2511
2553 /* 2512 /*
2554 Switch between 2513 Switch between
2555 1 0 0 0 0 0 0 (0) 2514 1 0 0 0 0 0 0 (0)
2556 64 32 16 8 4 2 1 (1) 2515 64 32 16 8 4 2 1 (1)
2557 64 48 36 27 20 15 11 (33) (approx) 2516 64 48 36 27 20 15 11 (33) (approx)
2558 64 56 49 43 37 33 29 (200) (approx) 2517 64 56 49 43 37 33 29 (200) (approx)
2559 */ 2518 */
2560 if(d > maxNoise[1]) 2519 if(d > maxNoise[1]){
2561 { 2520 if(d < maxNoise[2]){
2562 if(d < maxNoise[2]) 2521 for(y=0; y<8; y++){
2563 { 2522 int x;
2564 for(y=0; y<8; y++) 2523 for(x=0; x<8; x++){
2565 { 2524 int ref= tempBlured[ x + y*stride ];
2566 int x; 2525 int cur= src[ x + y*stride ];
2567 for(x=0; x<8; x++) 2526 tempBlured[ x + y*stride ]=
2568 { 2527 src[ x + y*stride ]=
2569 int ref= tempBlured[ x + y*stride ]; 2528 (ref + cur + 1)>>1;
2570 int cur= src[ x + y*stride ];
2571 tempBlured[ x + y*stride ]=
2572 src[ x + y*stride ]=
2573 (ref + cur + 1)>>1;
2574 }
2575 }
2576 } 2529 }
2577 else 2530 }
2578 { 2531 }else{
2579 for(y=0; y<8; y++) 2532 for(y=0; y<8; y++){
2580 { 2533 int x;
2581 int x; 2534 for(x=0; x<8; x++){
2582 for(x=0; x<8; x++) 2535 tempBlured[ x + y*stride ]= src[ x + y*stride ];
2583 {
2584 tempBlured[ x + y*stride ]= src[ x + y*stride ];
2585 }
2586 }
2587 } 2536 }
2537 }
2588 } 2538 }
2589 else 2539 }else{
2590 { 2540 if(d < maxNoise[0]){
2591 if(d < maxNoise[0]) 2541 for(y=0; y<8; y++){
2592 { 2542 int x;
2593 for(y=0; y<8; y++) 2543 for(x=0; x<8; x++){
2594 { 2544 int ref= tempBlured[ x + y*stride ];
2595 int x; 2545 int cur= src[ x + y*stride ];
2596 for(x=0; x<8; x++) 2546 tempBlured[ x + y*stride ]=
2597 { 2547 src[ x + y*stride ]=
2598 int ref= tempBlured[ x + y*stride ]; 2548 (ref*7 + cur + 4)>>3;
2599 int cur= src[ x + y*stride ];
2600 tempBlured[ x + y*stride ]=
2601 src[ x + y*stride ]=
2602 (ref*7 + cur + 4)>>3;
2603 }
2604 }
2605 } 2549 }
2606 else 2550 }
2607 { 2551 }else{
2608 for(y=0; y<8; y++) 2552 for(y=0; y<8; y++){
2609 { 2553 int x;
2610 int x; 2554 for(x=0; x<8; x++){
2611 for(x=0; x<8; x++) 2555 int ref= tempBlured[ x + y*stride ];
2612 { 2556 int cur= src[ x + y*stride ];
2613 int ref= tempBlured[ x + y*stride ]; 2557 tempBlured[ x + y*stride ]=
2614 int cur= src[ x + y*stride ]; 2558 src[ x + y*stride ]=
2615 tempBlured[ x + y*stride ]= 2559 (ref*3 + cur + 2)>>2;
2616 src[ x + y*stride ]=
2617 (ref*3 + cur + 2)>>2;
2618 }
2619 }
2620 } 2560 }
2561 }
2621 } 2562 }
2563 }
2622 } 2564 }
2623 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2565 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2624 } 2566 }
2625 #endif //HAVE_ALTIVEC 2567 #endif //HAVE_ALTIVEC
2626 2568
2627 #ifdef HAVE_MMX 2569 #ifdef HAVE_MMX
2628 /** 2570 /**
2629 * accurate deblock filter 2571 * accurate deblock filter
2630 */ 2572 */
2631 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2573 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2632 int64_t dc_mask, eq_mask, both_masks; 2574 int64_t dc_mask, eq_mask, both_masks;
2633 int64_t sums[10*8*2]; 2575 int64_t sums[10*8*2];
2634 src+= step*3; // src points to begin of the 8x8 Block 2576 src+= step*3; // src points to begin of the 8x8 Block
2635 //START_TIMER 2577 //START_TIMER
2636 asm volatile( 2578 asm volatile(
2637 "movq %0, %%mm7 \n\t" 2579 "movq %0, %%mm7 \n\t"
2638 "movq %1, %%mm6 \n\t" 2580 "movq %1, %%mm6 \n\t"
2639 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 2581 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2640 ); 2582 );
2641 2583
2642 asm volatile( 2584 asm volatile(
2643 "lea (%2, %3), %%"REG_a" \n\t" 2585 "lea (%2, %3), %%"REG_a" \n\t"
2644 // 0 1 2 3 4 5 6 7 8 9 2586 // 0 1 2 3 4 5 6 7 8 9
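2645 // %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (eax' = eax after the second lea, i.e. line 5) 2587 // %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (eax' = eax after the second lea, i.e. line 5)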
2646 2588
2647 "movq (%2), %%mm0 \n\t" 2589 "movq (%2), %%mm0 \n\t"
2648 "movq (%%"REG_a"), %%mm1 \n\t" 2590 "movq (%%"REG_a"), %%mm1 \n\t"
2649 "movq %%mm1, %%mm3 \n\t" 2591 "movq %%mm1, %%mm3 \n\t"
2650 "movq %%mm1, %%mm4 \n\t" 2592 "movq %%mm1, %%mm4 \n\t"
2651 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference 2593 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2652 "paddb %%mm7, %%mm0 \n\t" 2594 "paddb %%mm7, %%mm0 \n\t"
2653 "pcmpgtb %%mm6, %%mm0 \n\t" 2595 "pcmpgtb %%mm6, %%mm0 \n\t"
2654 2596
2655 "movq (%%"REG_a",%3), %%mm2 \n\t" 2597 "movq (%%"REG_a",%3), %%mm2 \n\t"
2656 PMAXUB(%%mm2, %%mm4) 2598 PMAXUB(%%mm2, %%mm4)
2657 PMINUB(%%mm2, %%mm3, %%mm5) 2599 PMINUB(%%mm2, %%mm3, %%mm5)
2658 "psubb %%mm2, %%mm1 \n\t" 2600 "psubb %%mm2, %%mm1 \n\t"
2659 "paddb %%mm7, %%mm1 \n\t" 2601 "paddb %%mm7, %%mm1 \n\t"
2660 "pcmpgtb %%mm6, %%mm1 \n\t" 2602 "pcmpgtb %%mm6, %%mm1 \n\t"
2661 "paddb %%mm1, %%mm0 \n\t" 2603 "paddb %%mm1, %%mm0 \n\t"
2662 2604
2663 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2605 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2664 PMAXUB(%%mm1, %%mm4) 2606 PMAXUB(%%mm1, %%mm4)
2665 PMINUB(%%mm1, %%mm3, %%mm5) 2607 PMINUB(%%mm1, %%mm3, %%mm5)
2666 "psubb %%mm1, %%mm2 \n\t" 2608 "psubb %%mm1, %%mm2 \n\t"
2667 "paddb %%mm7, %%mm2 \n\t" 2609 "paddb %%mm7, %%mm2 \n\t"
2668 "pcmpgtb %%mm6, %%mm2 \n\t" 2610 "pcmpgtb %%mm6, %%mm2 \n\t"
2669 "paddb %%mm2, %%mm0 \n\t" 2611 "paddb %%mm2, %%mm0 \n\t"
2670 2612
2671 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 2613 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
2672 2614
2673 "movq (%2, %3, 4), %%mm2 \n\t" 2615 "movq (%2, %3, 4), %%mm2 \n\t"
2674 PMAXUB(%%mm2, %%mm4) 2616 PMAXUB(%%mm2, %%mm4)
2675 PMINUB(%%mm2, %%mm3, %%mm5) 2617 PMINUB(%%mm2, %%mm3, %%mm5)
2676 "psubb %%mm2, %%mm1 \n\t" 2618 "psubb %%mm2, %%mm1 \n\t"
2677 "paddb %%mm7, %%mm1 \n\t" 2619 "paddb %%mm7, %%mm1 \n\t"
2678 "pcmpgtb %%mm6, %%mm1 \n\t" 2620 "pcmpgtb %%mm6, %%mm1 \n\t"
2679 "paddb %%mm1, %%mm0 \n\t" 2621 "paddb %%mm1, %%mm0 \n\t"
2680 2622
2681 "movq (%%"REG_a"), %%mm1 \n\t" 2623 "movq (%%"REG_a"), %%mm1 \n\t"
2682 PMAXUB(%%mm1, %%mm4) 2624 PMAXUB(%%mm1, %%mm4)
2683 PMINUB(%%mm1, %%mm3, %%mm5) 2625 PMINUB(%%mm1, %%mm3, %%mm5)
2684 "psubb %%mm1, %%mm2 \n\t" 2626 "psubb %%mm1, %%mm2 \n\t"
2685 "paddb %%mm7, %%mm2 \n\t" 2627 "paddb %%mm7, %%mm2 \n\t"
2686 "pcmpgtb %%mm6, %%mm2 \n\t" 2628 "pcmpgtb %%mm6, %%mm2 \n\t"
2687 "paddb %%mm2, %%mm0 \n\t" 2629 "paddb %%mm2, %%mm0 \n\t"
2688 2630
2689 "movq (%%"REG_a", %3), %%mm2 \n\t" 2631 "movq (%%"REG_a", %3), %%mm2 \n\t"
2690 PMAXUB(%%mm2, %%mm4) 2632 PMAXUB(%%mm2, %%mm4)
2691 PMINUB(%%mm2, %%mm3, %%mm5) 2633 PMINUB(%%mm2, %%mm3, %%mm5)
2692 "psubb %%mm2, %%mm1 \n\t" 2634 "psubb %%mm2, %%mm1 \n\t"
2693 "paddb %%mm7, %%mm1 \n\t" 2635 "paddb %%mm7, %%mm1 \n\t"
2694 "pcmpgtb %%mm6, %%mm1 \n\t" 2636 "pcmpgtb %%mm6, %%mm1 \n\t"
2695 "paddb %%mm1, %%mm0 \n\t" 2637 "paddb %%mm1, %%mm0 \n\t"
2696 2638
2697 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 2639 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2698 PMAXUB(%%mm1, %%mm4) 2640 PMAXUB(%%mm1, %%mm4)
2699 PMINUB(%%mm1, %%mm3, %%mm5) 2641 PMINUB(%%mm1, %%mm3, %%mm5)
2700 "psubb %%mm1, %%mm2 \n\t" 2642 "psubb %%mm1, %%mm2 \n\t"
2701 "paddb %%mm7, %%mm2 \n\t" 2643 "paddb %%mm7, %%mm2 \n\t"
2702 "pcmpgtb %%mm6, %%mm2 \n\t" 2644 "pcmpgtb %%mm6, %%mm2 \n\t"
2703 "paddb %%mm2, %%mm0 \n\t" 2645 "paddb %%mm2, %%mm0 \n\t"
2704 2646
2705 "movq (%2, %3, 8), %%mm2 \n\t" 2647 "movq (%2, %3, 8), %%mm2 \n\t"
2706 PMAXUB(%%mm2, %%mm4) 2648 PMAXUB(%%mm2, %%mm4)
2707 PMINUB(%%mm2, %%mm3, %%mm5) 2649 PMINUB(%%mm2, %%mm3, %%mm5)
2708 "psubb %%mm2, %%mm1 \n\t" 2650 "psubb %%mm2, %%mm1 \n\t"
2709 "paddb %%mm7, %%mm1 \n\t" 2651 "paddb %%mm7, %%mm1 \n\t"
2710 "pcmpgtb %%mm6, %%mm1 \n\t" 2652 "pcmpgtb %%mm6, %%mm1 \n\t"
2711 "paddb %%mm1, %%mm0 \n\t" 2653 "paddb %%mm1, %%mm0 \n\t"
2712 2654
2713 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" 2655 "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
2714 "psubb %%mm1, %%mm2 \n\t" 2656 "psubb %%mm1, %%mm2 \n\t"
2715 "paddb %%mm7, %%mm2 \n\t" 2657 "paddb %%mm7, %%mm2 \n\t"
2716 "pcmpgtb %%mm6, %%mm2 \n\t" 2658 "pcmpgtb %%mm6, %%mm2 \n\t"
2717 "paddb %%mm2, %%mm0 \n\t" 2659 "paddb %%mm2, %%mm0 \n\t"
2718 "psubusb %%mm3, %%mm4 \n\t" 2660 "psubusb %%mm3, %%mm4 \n\t"
2719 2661
2720 "pxor %%mm6, %%mm6 \n\t" 2662 "pxor %%mm6, %%mm6 \n\t"
2721 "movq %4, %%mm7 \n\t" // QP,..., QP 2663 "movq %4, %%mm7 \n\t" // QP,..., QP
2722 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 2664 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2723 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 2665 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2724 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff >= 2QP -> FF 2666 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff >= 2QP -> FF
2725 "pcmpeqb %%mm6, %%mm7 \n\t" // invert: Diff < 2QP -> FF 2667 "pcmpeqb %%mm6, %%mm7 \n\t" // invert: Diff < 2QP -> FF
2726 "movq %%mm7, %1 \n\t" 2668 "movq %%mm7, %1 \n\t"
2727 2669
2728 "movq %5, %%mm7 \n\t" 2670 "movq %5, %%mm7 \n\t"
2729 "punpcklbw %%mm7, %%mm7 \n\t" 2671 "punpcklbw %%mm7, %%mm7 \n\t"
2730 "punpcklbw %%mm7, %%mm7 \n\t" 2672 "punpcklbw %%mm7, %%mm7 \n\t"
2731 "punpcklbw %%mm7, %%mm7 \n\t" 2673 "punpcklbw %%mm7, %%mm7 \n\t"
2732 "psubb %%mm0, %%mm6 \n\t" 2674 "psubb %%mm0, %%mm6 \n\t"
2733 "pcmpgtb %%mm7, %%mm6 \n\t" 2675 "pcmpgtb %%mm7, %%mm6 \n\t"
2734 "movq %%mm6, %0 \n\t" 2676 "movq %%mm6, %0 \n\t"
2735 2677
2736 : "=m" (eq_mask), "=m" (dc_mask) 2678 : "=m" (eq_mask), "=m" (dc_mask)
2737 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 2679 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2738 : "%"REG_a 2680 : "%"REG_a
2739 ); 2681 );
2740 2682
2741 both_masks = dc_mask & eq_mask; 2683 both_masks = dc_mask & eq_mask;
2742 2684
2743 if(both_masks){ 2685 if(both_masks){
2744 long offset= -8*step; 2686 long offset= -8*step;
2745 int64_t *temp_sums= sums; 2687 int64_t *temp_sums= sums;
2746 2688
2747 asm volatile( 2689 asm volatile(
2748 "movq %2, %%mm0 \n\t" // QP,..., QP 2690 "movq %2, %%mm0 \n\t" // QP,..., QP
2749 "pxor %%mm4, %%mm4 \n\t" 2691 "pxor %%mm4, %%mm4 \n\t"
2750 2692
2751 "movq (%0), %%mm6 \n\t" 2693 "movq (%0), %%mm6 \n\t"
2752 "movq (%0, %1), %%mm5 \n\t" 2694 "movq (%0, %1), %%mm5 \n\t"
2753 "movq %%mm5, %%mm1 \n\t" 2695 "movq %%mm5, %%mm1 \n\t"
2754 "movq %%mm6, %%mm2 \n\t" 2696 "movq %%mm6, %%mm2 \n\t"
2755 "psubusb %%mm6, %%mm5 \n\t" 2697 "psubusb %%mm6, %%mm5 \n\t"
2756 "psubusb %%mm1, %%mm2 \n\t" 2698 "psubusb %%mm1, %%mm2 \n\t"
2757 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2699 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2758 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2700 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2759 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2701 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2760 2702
2761 "pxor %%mm6, %%mm1 \n\t" 2703 "pxor %%mm6, %%mm1 \n\t"
2762 "pand %%mm0, %%mm1 \n\t" 2704 "pand %%mm0, %%mm1 \n\t"
2763 "pxor %%mm1, %%mm6 \n\t" 2705 "pxor %%mm1, %%mm6 \n\t"
2764 // 0:QP 6:First 2706 // 0:QP 6:First
2765 2707
2766 "movq (%0, %1, 8), %%mm5 \n\t" 2708 "movq (%0, %1, 8), %%mm5 \n\t"
2767 "add %1, %0 \n\t" // %0 points to line 1 not 0 2709 "add %1, %0 \n\t" // %0 points to line 1 not 0
2768 "movq (%0, %1, 8), %%mm7 \n\t" 2710 "movq (%0, %1, 8), %%mm7 \n\t"
2769 "movq %%mm5, %%mm1 \n\t" 2711 "movq %%mm5, %%mm1 \n\t"
2770 "movq %%mm7, %%mm2 \n\t" 2712 "movq %%mm7, %%mm2 \n\t"
2771 "psubusb %%mm7, %%mm5 \n\t" 2713 "psubusb %%mm7, %%mm5 \n\t"
2772 "psubusb %%mm1, %%mm2 \n\t" 2714 "psubusb %%mm1, %%mm2 \n\t"
2773 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 2715 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2774 "movq %2, %%mm0 \n\t" // QP,..., QP 2716 "movq %2, %%mm0 \n\t" // QP,..., QP
2775 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 2717 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2776 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 2718 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2777 2719
2778 "pxor %%mm7, %%mm1 \n\t" 2720 "pxor %%mm7, %%mm1 \n\t"
2779 "pand %%mm0, %%mm1 \n\t" 2721 "pand %%mm0, %%mm1 \n\t"
2780 "pxor %%mm1, %%mm7 \n\t" 2722 "pxor %%mm1, %%mm7 \n\t"
2781 2723
2782 "movq %%mm6, %%mm5 \n\t" 2724 "movq %%mm6, %%mm5 \n\t"
2783 "punpckhbw %%mm4, %%mm6 \n\t" 2725 "punpckhbw %%mm4, %%mm6 \n\t"
2784 "punpcklbw %%mm4, %%mm5 \n\t" 2726 "punpcklbw %%mm4, %%mm5 \n\t"
2785 // 4:0 5/6:First 7:Last 2727 // 4:0 5/6:First 7:Last
2786 2728
2787 "movq %%mm5, %%mm0 \n\t" 2729 "movq %%mm5, %%mm0 \n\t"
2788 "movq %%mm6, %%mm1 \n\t" 2730 "movq %%mm6, %%mm1 \n\t"
2789 "psllw $2, %%mm0 \n\t" 2731 "psllw $2, %%mm0 \n\t"
2790 "psllw $2, %%mm1 \n\t" 2732 "psllw $2, %%mm1 \n\t"
2791 "paddw "MANGLE(w04)", %%mm0 \n\t" 2733 "paddw "MANGLE(w04)", %%mm0 \n\t"
2792 "paddw "MANGLE(w04)", %%mm1 \n\t" 2734 "paddw "MANGLE(w04)", %%mm1 \n\t"
2793 2735
2794 #define NEXT\ 2736 #define NEXT\
2795 "movq (%0), %%mm2 \n\t"\ 2737 "movq (%0), %%mm2 \n\t"\
2796 "movq (%0), %%mm3 \n\t"\ 2738 "movq (%0), %%mm3 \n\t"\
2797 "add %1, %0 \n\t"\ 2739 "add %1, %0 \n\t"\
2798 "punpcklbw %%mm4, %%mm2 \n\t"\ 2740 "punpcklbw %%mm4, %%mm2 \n\t"\
2799 "punpckhbw %%mm4, %%mm3 \n\t"\ 2741 "punpckhbw %%mm4, %%mm3 \n\t"\
2800 "paddw %%mm2, %%mm0 \n\t"\ 2742 "paddw %%mm2, %%mm0 \n\t"\
2801 "paddw %%mm3, %%mm1 \n\t" 2743 "paddw %%mm3, %%mm1 \n\t"
2802 2744
2803 #define PREV\ 2745 #define PREV\
2804 "movq (%0), %%mm2 \n\t"\ 2746 "movq (%0), %%mm2 \n\t"\
2805 "movq (%0), %%mm3 \n\t"\ 2747 "movq (%0), %%mm3 \n\t"\
2806 "add %1, %0 \n\t"\ 2748 "add %1, %0 \n\t"\
2807 "punpcklbw %%mm4, %%mm2 \n\t"\ 2749 "punpcklbw %%mm4, %%mm2 \n\t"\
2808 "punpckhbw %%mm4, %%mm3 \n\t"\ 2750 "punpckhbw %%mm4, %%mm3 \n\t"\
2809 "psubw %%mm2, %%mm0 \n\t"\ 2751 "psubw %%mm2, %%mm0 \n\t"\
2810 "psubw %%mm3, %%mm1 \n\t" 2752 "psubw %%mm3, %%mm1 \n\t"
2811 2753
2812 2754
2813 NEXT //0 2755 NEXT //0
2814 NEXT //1 2756 NEXT //1
2815 NEXT //2 2757 NEXT //2
2816 "movq %%mm0, (%3) \n\t" 2758 "movq %%mm0, (%3) \n\t"
2817 "movq %%mm1, 8(%3) \n\t" 2759 "movq %%mm1, 8(%3) \n\t"
2818 2760
2819 NEXT //3 2761 NEXT //3
2820 "psubw %%mm5, %%mm0 \n\t" 2762 "psubw %%mm5, %%mm0 \n\t"
2821 "psubw %%mm6, %%mm1 \n\t" 2763 "psubw %%mm6, %%mm1 \n\t"
2822 "movq %%mm0, 16(%3) \n\t" 2764 "movq %%mm0, 16(%3) \n\t"
2823 "movq %%mm1, 24(%3) \n\t" 2765 "movq %%mm1, 24(%3) \n\t"
2824 2766
2825 NEXT //4 2767 NEXT //4
2826 "psubw %%mm5, %%mm0 \n\t" 2768 "psubw %%mm5, %%mm0 \n\t"
2827 "psubw %%mm6, %%mm1 \n\t" 2769 "psubw %%mm6, %%mm1 \n\t"
2828 "movq %%mm0, 32(%3) \n\t" 2770 "movq %%mm0, 32(%3) \n\t"
2829 "movq %%mm1, 40(%3) \n\t" 2771 "movq %%mm1, 40(%3) \n\t"
2830 2772
2831 NEXT //5 2773 NEXT //5
2832 "psubw %%mm5, %%mm0 \n\t" 2774 "psubw %%mm5, %%mm0 \n\t"
2833 "psubw %%mm6, %%mm1 \n\t" 2775 "psubw %%mm6, %%mm1 \n\t"
2834 "movq %%mm0, 48(%3) \n\t" 2776 "movq %%mm0, 48(%3) \n\t"
2835 "movq %%mm1, 56(%3) \n\t" 2777 "movq %%mm1, 56(%3) \n\t"
2836 2778
2837 NEXT //6 2779 NEXT //6
2838 "psubw %%mm5, %%mm0 \n\t" 2780 "psubw %%mm5, %%mm0 \n\t"
2839 "psubw %%mm6, %%mm1 \n\t" 2781 "psubw %%mm6, %%mm1 \n\t"
2840 "movq %%mm0, 64(%3) \n\t" 2782 "movq %%mm0, 64(%3) \n\t"
2841 "movq %%mm1, 72(%3) \n\t" 2783 "movq %%mm1, 72(%3) \n\t"
2842 2784
2843 "movq %%mm7, %%mm6 \n\t" 2785 "movq %%mm7, %%mm6 \n\t"
2844 "punpckhbw %%mm4, %%mm7 \n\t" 2786 "punpckhbw %%mm4, %%mm7 \n\t"
2845 "punpcklbw %%mm4, %%mm6 \n\t" 2787 "punpcklbw %%mm4, %%mm6 \n\t"
2846 2788
2847 NEXT //7 2789 NEXT //7
2848 "mov %4, %0 \n\t" 2790 "mov %4, %0 \n\t"
2849 "add %1, %0 \n\t" 2791 "add %1, %0 \n\t"
2850 PREV //0 2792 PREV //0
2851 "movq %%mm0, 80(%3) \n\t" 2793 "movq %%mm0, 80(%3) \n\t"
2852 "movq %%mm1, 88(%3) \n\t" 2794 "movq %%mm1, 88(%3) \n\t"
2853 2795
2854 PREV //1 2796 PREV //1
2855 "paddw %%mm6, %%mm0 \n\t" 2797 "paddw %%mm6, %%mm0 \n\t"
2856 "paddw %%mm7, %%mm1 \n\t" 2798 "paddw %%mm7, %%mm1 \n\t"
2857 "movq %%mm0, 96(%3) \n\t" 2799 "movq %%mm0, 96(%3) \n\t"
2858 "movq %%mm1, 104(%3) \n\t" 2800 "movq %%mm1, 104(%3) \n\t"
2859 2801
2860 PREV //2 2802 PREV //2
2861 "paddw %%mm6, %%mm0 \n\t" 2803 "paddw %%mm6, %%mm0 \n\t"
2862 "paddw %%mm7, %%mm1 \n\t" 2804 "paddw %%mm7, %%mm1 \n\t"
2863 "movq %%mm0, 112(%3) \n\t" 2805 "movq %%mm0, 112(%3) \n\t"
2864 "movq %%mm1, 120(%3) \n\t" 2806 "movq %%mm1, 120(%3) \n\t"
2865 2807
2866 PREV //3 2808 PREV //3
2867 "paddw %%mm6, %%mm0 \n\t" 2809 "paddw %%mm6, %%mm0 \n\t"
2868 "paddw %%mm7, %%mm1 \n\t" 2810 "paddw %%mm7, %%mm1 \n\t"
2869 "movq %%mm0, 128(%3) \n\t" 2811 "movq %%mm0, 128(%3) \n\t"
2870 "movq %%mm1, 136(%3) \n\t" 2812 "movq %%mm1, 136(%3) \n\t"
2871 2813
2872 PREV //4 2814 PREV //4
2873 "paddw %%mm6, %%mm0 \n\t" 2815 "paddw %%mm6, %%mm0 \n\t"
2874 "paddw %%mm7, %%mm1 \n\t" 2816 "paddw %%mm7, %%mm1 \n\t"
2875 "movq %%mm0, 144(%3) \n\t" 2817 "movq %%mm0, 144(%3) \n\t"
2876 "movq %%mm1, 152(%3) \n\t" 2818 "movq %%mm1, 152(%3) \n\t"
2877 2819
2878 "mov %4, %0 \n\t" //FIXME 2820 "mov %4, %0 \n\t" //FIXME
2879 2821
2880 : "+&r"(src) 2822 : "+&r"(src)
2881 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) 2823 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2882 ); 2824 );
2883 2825
2884 src+= step; // src points to begin of the 8x8 Block 2826 src+= step; // src points to begin of the 8x8 Block
2885 2827
2886 asm volatile( 2828 asm volatile(
2887 "movq %4, %%mm6 \n\t" 2829 "movq %4, %%mm6 \n\t"
2888 "pcmpeqb %%mm5, %%mm5 \n\t" 2830 "pcmpeqb %%mm5, %%mm5 \n\t"
2889 "pxor %%mm6, %%mm5 \n\t" 2831 "pxor %%mm6, %%mm5 \n\t"
2890 "pxor %%mm7, %%mm7 \n\t" 2832 "pxor %%mm7, %%mm7 \n\t"
2891 2833
2892 "1: \n\t" 2834 "1: \n\t"
2893 "movq (%1), %%mm0 \n\t" 2835 "movq (%1), %%mm0 \n\t"
2894 "movq 8(%1), %%mm1 \n\t" 2836 "movq 8(%1), %%mm1 \n\t"
2895 "paddw 32(%1), %%mm0 \n\t" 2837 "paddw 32(%1), %%mm0 \n\t"
2896 "paddw 40(%1), %%mm1 \n\t" 2838 "paddw 40(%1), %%mm1 \n\t"
2897 "movq (%0, %3), %%mm2 \n\t" 2839 "movq (%0, %3), %%mm2 \n\t"
2898 "movq %%mm2, %%mm3 \n\t" 2840 "movq %%mm2, %%mm3 \n\t"
2899 "movq %%mm2, %%mm4 \n\t" 2841 "movq %%mm2, %%mm4 \n\t"
2900 "punpcklbw %%mm7, %%mm2 \n\t" 2842 "punpcklbw %%mm7, %%mm2 \n\t"
2901 "punpckhbw %%mm7, %%mm3 \n\t" 2843 "punpckhbw %%mm7, %%mm3 \n\t"
2902 "paddw %%mm2, %%mm0 \n\t" 2844 "paddw %%mm2, %%mm0 \n\t"
2903 "paddw %%mm3, %%mm1 \n\t" 2845 "paddw %%mm3, %%mm1 \n\t"
2904 "paddw %%mm2, %%mm0 \n\t" 2846 "paddw %%mm2, %%mm0 \n\t"
2905 "paddw %%mm3, %%mm1 \n\t" 2847 "paddw %%mm3, %%mm1 \n\t"
2906 "psrlw $4, %%mm0 \n\t" 2848 "psrlw $4, %%mm0 \n\t"
2907 "psrlw $4, %%mm1 \n\t" 2849 "psrlw $4, %%mm1 \n\t"
2908 "packuswb %%mm1, %%mm0 \n\t" 2850 "packuswb %%mm1, %%mm0 \n\t"
2909 "pand %%mm6, %%mm0 \n\t" 2851 "pand %%mm6, %%mm0 \n\t"
2910 "pand %%mm5, %%mm4 \n\t" 2852 "pand %%mm5, %%mm4 \n\t"
2911 "por %%mm4, %%mm0 \n\t" 2853 "por %%mm4, %%mm0 \n\t"
2912 "movq %%mm0, (%0, %3) \n\t" 2854 "movq %%mm0, (%0, %3) \n\t"
2913 "add $16, %1 \n\t" 2855 "add $16, %1 \n\t"
2914 "add %2, %0 \n\t" 2856 "add %2, %0 \n\t"
2915 " js 1b \n\t" 2857 " js 1b \n\t"
2916 2858
2917 : "+r"(offset), "+r"(temp_sums) 2859 : "+r"(offset), "+r"(temp_sums)
2918 : "r" ((long)step), "r"(src - offset), "m"(both_masks) 2860 : "r" ((long)step), "r"(src - offset), "m"(both_masks)
2919 ); 2861 );
2920 }else 2862 }else
2921 src+= step; // src points to begin of the 8x8 Block 2863 src+= step; // src points to begin of the 8x8 Block
2922 2864
2923 if(eq_mask != -1LL){ 2865 if(eq_mask != -1LL){
2924 uint8_t *temp_src= src; 2866 uint8_t *temp_src= src;
2925 asm volatile( 2867 asm volatile(
2926 "pxor %%mm7, %%mm7 \n\t" 2868 "pxor %%mm7, %%mm7 \n\t"
2927 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 2869 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
2928 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 2870 "and "ALIGN_MASK", %%"REG_c" \n\t" // align
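2929 // line: 0 1 2 3 4 5 6 7 2871 // line: 0 1 2 3 4 5 6 7
2930 // addr: %0 %0+%1 eax eax+%1 eax+2%1 %0'+2%1 eax+4%1 %0'+4%1 (%0' = %0 after "lea (eax, %1), %0", i.e. line 3; ecx = aligned stack scratch) 2872 // addr: %0 %0+%1 eax eax+%1 eax+2%1 %0'+2%1 eax+4%1 %0'+4%1 (%0' = %0 after "lea (eax, %1), %0", i.e. line 3; ecx = aligned stack scratch)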
2931 2873
2932 "movq (%0), %%mm0 \n\t" 2874 "movq (%0), %%mm0 \n\t"
2933 "movq %%mm0, %%mm1 \n\t" 2875 "movq %%mm0, %%mm1 \n\t"
2934 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 2876 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2935 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 2877 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2936 2878
2937 "movq (%0, %1), %%mm2 \n\t" 2879 "movq (%0, %1), %%mm2 \n\t"
2938 "lea (%0, %1, 2), %%"REG_a" \n\t" 2880 "lea (%0, %1, 2), %%"REG_a" \n\t"
2939 "movq %%mm2, %%mm3 \n\t" 2881 "movq %%mm2, %%mm3 \n\t"
2940 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 2882 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2941 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 2883 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2942 2884
2943 "movq (%%"REG_a"), %%mm4 \n\t" 2885 "movq (%%"REG_a"), %%mm4 \n\t"
2944 "movq %%mm4, %%mm5 \n\t" 2886 "movq %%mm4, %%mm5 \n\t"
2945 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 2887 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2946 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 2888 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2947 2889
2948 "paddw %%mm0, %%mm0 \n\t" // 2L0 2890 "paddw %%mm0, %%mm0 \n\t" // 2L0
2949 "paddw %%mm1, %%mm1 \n\t" // 2H0 2891 "paddw %%mm1, %%mm1 \n\t" // 2H0
2950 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 2892 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2951 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 2893 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2952 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 2894 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2953 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 2895 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2954 2896
2955 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 2897 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2956 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 2898 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2957 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 2899 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2958 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 2900 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2959 2901
2960 "movq (%%"REG_a", %1), %%mm2 \n\t" 2902 "movq (%%"REG_a", %1), %%mm2 \n\t"
2961 "movq %%mm2, %%mm3 \n\t" 2903 "movq %%mm2, %%mm3 \n\t"
2962 "punpcklbw %%mm7, %%mm2 \n\t" // L3 2904 "punpcklbw %%mm7, %%mm2 \n\t" // L3
2963 "punpckhbw %%mm7, %%mm3 \n\t" // H3 2905 "punpckhbw %%mm7, %%mm3 \n\t" // H3
2964 2906
2965 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 2907 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2966 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 2908 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2967 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2909 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2968 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2910 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2969 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2911 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2970 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2912 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2971 2913
2972 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 2914 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
2973 "movq %%mm0, %%mm1 \n\t" 2915 "movq %%mm0, %%mm1 \n\t"
2974 "punpcklbw %%mm7, %%mm0 \n\t" // L4 2916 "punpcklbw %%mm7, %%mm0 \n\t" // L4
2975 "punpckhbw %%mm7, %%mm1 \n\t" // H4 2917 "punpckhbw %%mm7, %%mm1 \n\t" // H4
2976 2918
2977 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 2919 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2978 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 2920 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2979 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 2921 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
2980 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 2922 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
2981 "paddw %%mm4, %%mm4 \n\t" // 2L2 2923 "paddw %%mm4, %%mm4 \n\t" // 2L2
2982 "paddw %%mm5, %%mm5 \n\t" // 2H2 2924 "paddw %%mm5, %%mm5 \n\t" // 2H2
2983 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 2925 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2984 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 2926 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2985 2927
2986 "lea (%%"REG_a", %1), %0 \n\t" 2928 "lea (%%"REG_a", %1), %0 \n\t"
2987 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 2929 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2988 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 2930 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2989 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 2931 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2990 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 2932 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2991 //50 opcodes so far 2933 //50 opcodes so far
2992 "movq (%0, %1, 2), %%mm2 \n\t" 2934 "movq (%0, %1, 2), %%mm2 \n\t"
2993 "movq %%mm2, %%mm3 \n\t" 2935 "movq %%mm2, %%mm3 \n\t"
2994 "punpcklbw %%mm7, %%mm2 \n\t" // L5 2936 "punpcklbw %%mm7, %%mm2 \n\t" // L5
2995 "punpckhbw %%mm7, %%mm3 \n\t" // H5 2937 "punpckhbw %%mm7, %%mm3 \n\t" // H5
2996 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 2938 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2997 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 2939 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2998 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 2940 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2999 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 2941 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3000 2942
3001 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2943 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3002 "punpcklbw %%mm7, %%mm6 \n\t" // L6 2944 "punpcklbw %%mm7, %%mm6 \n\t" // L6
3003 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 2945 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
3004 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 2946 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3005 "punpckhbw %%mm7, %%mm6 \n\t" // H6 2947 "punpckhbw %%mm7, %%mm6 \n\t" // H6
3006 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 2948 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
3007 2949
3008 "paddw %%mm0, %%mm0 \n\t" // 2L4 2950 "paddw %%mm0, %%mm0 \n\t" // 2L4
3009 "paddw %%mm1, %%mm1 \n\t" // 2H4 2951 "paddw %%mm1, %%mm1 \n\t" // 2H4
3010 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 2952 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
3011 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 2953 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
3012 2954
3013 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 2955 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
3014 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 2956 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
3015 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 2957 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
3016 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 2958 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
3017 2959
3018 "movq (%0, %1, 4), %%mm2 \n\t" 2960 "movq (%0, %1, 4), %%mm2 \n\t"
3019 "movq %%mm2, %%mm3 \n\t" 2961 "movq %%mm2, %%mm3 \n\t"
3020 "punpcklbw %%mm7, %%mm2 \n\t" // L7 2962 "punpcklbw %%mm7, %%mm2 \n\t" // L7
3021 "punpckhbw %%mm7, %%mm3 \n\t" // H7 2963 "punpckhbw %%mm7, %%mm3 \n\t" // H7
3022 2964
3023 "paddw %%mm2, %%mm2 \n\t" // 2L7 2965 "paddw %%mm2, %%mm2 \n\t" // 2L7
3024 "paddw %%mm3, %%mm3 \n\t" // 2H7 2966 "paddw %%mm3, %%mm3 \n\t" // 2H7
3025 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 2967 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3026 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 2968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3027 2969
3028 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2970 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3029 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2971 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3030 2972
3031 #ifdef HAVE_MMX2 2973 #ifdef HAVE_MMX2
3032 "movq %%mm7, %%mm6 \n\t" // 0 2974 "movq %%mm7, %%mm6 \n\t" // 0
3033 "psubw %%mm0, %%mm6 \n\t" 2975 "psubw %%mm0, %%mm6 \n\t"
3034 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2976 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3035 "movq %%mm7, %%mm6 \n\t" // 0 2977 "movq %%mm7, %%mm6 \n\t" // 0
3036 "psubw %%mm1, %%mm6 \n\t" 2978 "psubw %%mm1, %%mm6 \n\t"
3037 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2979 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3038 "movq %%mm7, %%mm6 \n\t" // 0 2980 "movq %%mm7, %%mm6 \n\t" // 0
3039 "psubw %%mm2, %%mm6 \n\t" 2981 "psubw %%mm2, %%mm6 \n\t"
3040 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2982 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3041 "movq %%mm7, %%mm6 \n\t" // 0 2983 "movq %%mm7, %%mm6 \n\t" // 0
3042 "psubw %%mm3, %%mm6 \n\t" 2984 "psubw %%mm3, %%mm6 \n\t"
3043 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 2985 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3044 #else 2986 #else
3045 "movq %%mm7, %%mm6 \n\t" // 0 2987 "movq %%mm7, %%mm6 \n\t" // 0
3046 "pcmpgtw %%mm0, %%mm6 \n\t" 2988 "pcmpgtw %%mm0, %%mm6 \n\t"
3047 "pxor %%mm6, %%mm0 \n\t" 2989 "pxor %%mm6, %%mm0 \n\t"
3048 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2990 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3049 "movq %%mm7, %%mm6 \n\t" // 0 2991 "movq %%mm7, %%mm6 \n\t" // 0
3050 "pcmpgtw %%mm1, %%mm6 \n\t" 2992 "pcmpgtw %%mm1, %%mm6 \n\t"
3051 "pxor %%mm6, %%mm1 \n\t" 2993 "pxor %%mm6, %%mm1 \n\t"
3052 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 2994 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3053 "movq %%mm7, %%mm6 \n\t" // 0 2995 "movq %%mm7, %%mm6 \n\t" // 0
3054 "pcmpgtw %%mm2, %%mm6 \n\t" 2996 "pcmpgtw %%mm2, %%mm6 \n\t"
3055 "pxor %%mm6, %%mm2 \n\t" 2997 "pxor %%mm6, %%mm2 \n\t"
3056 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 2998 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3057 "movq %%mm7, %%mm6 \n\t" // 0 2999 "movq %%mm7, %%mm6 \n\t" // 0
3058 "pcmpgtw %%mm3, %%mm6 \n\t" 3000 "pcmpgtw %%mm3, %%mm6 \n\t"
3059 "pxor %%mm6, %%mm3 \n\t" 3001 "pxor %%mm6, %%mm3 \n\t"
3060 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 3002 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3061 #endif 3003 #endif
3062 3004
3063 #ifdef HAVE_MMX2 3005 #ifdef HAVE_MMX2
3064 "pminsw %%mm2, %%mm0 \n\t" 3006 "pminsw %%mm2, %%mm0 \n\t"
3065 "pminsw %%mm3, %%mm1 \n\t" 3007 "pminsw %%mm3, %%mm1 \n\t"
3066 #else 3008 #else
3067 "movq %%mm0, %%mm6 \n\t" 3009 "movq %%mm0, %%mm6 \n\t"
3068 "psubusw %%mm2, %%mm6 \n\t" 3010 "psubusw %%mm2, %%mm6 \n\t"
3069 "psubw %%mm6, %%mm0 \n\t" 3011 "psubw %%mm6, %%mm0 \n\t"
3070 "movq %%mm1, %%mm6 \n\t" 3012 "movq %%mm1, %%mm6 \n\t"
3071 "psubusw %%mm3, %%mm6 \n\t" 3013 "psubusw %%mm3, %%mm6 \n\t"
3072 "psubw %%mm6, %%mm1 \n\t" 3014 "psubw %%mm6, %%mm1 \n\t"
3073 #endif 3015 #endif
3074 3016
3075 "movd %2, %%mm2 \n\t" // QP 3017 "movd %2, %%mm2 \n\t" // QP
3076 "punpcklbw %%mm7, %%mm2 \n\t" 3018 "punpcklbw %%mm7, %%mm2 \n\t"
3077 3019
3078 "movq %%mm7, %%mm6 \n\t" // 0 3020 "movq %%mm7, %%mm6 \n\t" // 0
3079 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 3021 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3080 "pxor %%mm6, %%mm4 \n\t" 3022 "pxor %%mm6, %%mm4 \n\t"
3081 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 3023 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3082 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 3024 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3083 "pxor %%mm7, %%mm5 \n\t" 3025 "pxor %%mm7, %%mm5 \n\t"
3084 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 3026 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3085 // 100 opcodes 3027 // 100 opcodes
3086 "psllw $3, %%mm2 \n\t" // 8QP 3028 "psllw $3, %%mm2 \n\t" // 8QP
3087 "movq %%mm2, %%mm3 \n\t" // 8QP 3029 "movq %%mm2, %%mm3 \n\t" // 8QP
3088 "pcmpgtw %%mm4, %%mm2 \n\t" 3030 "pcmpgtw %%mm4, %%mm2 \n\t"
3089 "pcmpgtw %%mm5, %%mm3 \n\t" 3031 "pcmpgtw %%mm5, %%mm3 \n\t"
3090 "pand %%mm2, %%mm4 \n\t" 3032 "pand %%mm2, %%mm4 \n\t"
3091 "pand %%mm3, %%mm5 \n\t" 3033 "pand %%mm3, %%mm5 \n\t"
3092 3034
3093 3035
3094 "psubusw %%mm0, %%mm4 \n\t" // hd 3036 "psubusw %%mm0, %%mm4 \n\t" // hd
3095 "psubusw %%mm1, %%mm5 \n\t" // ld 3037 "psubusw %%mm1, %%mm5 \n\t" // ld
3096 3038
3097 3039
3098 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 3040 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3099 "pmullw %%mm2, %%mm4 \n\t" 3041 "pmullw %%mm2, %%mm4 \n\t"
3100 "pmullw %%mm2, %%mm5 \n\t" 3042 "pmullw %%mm2, %%mm5 \n\t"
3101 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 3043 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3102 "paddw %%mm2, %%mm4 \n\t" 3044 "paddw %%mm2, %%mm4 \n\t"
3103 "paddw %%mm2, %%mm5 \n\t" 3045 "paddw %%mm2, %%mm5 \n\t"
3104 "psrlw $6, %%mm4 \n\t" 3046 "psrlw $6, %%mm4 \n\t"
3105 "psrlw $6, %%mm5 \n\t" 3047 "psrlw $6, %%mm5 \n\t"
3106 3048
3107 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 3049 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
3108 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 3050 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
3109 3051
3110 "pxor %%mm2, %%mm2 \n\t" 3052 "pxor %%mm2, %%mm2 \n\t"
3111 "pxor %%mm3, %%mm3 \n\t" 3053 "pxor %%mm3, %%mm3 \n\t"
3112 3054
3113 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 3055 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3114 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 3056 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3115 "pxor %%mm2, %%mm0 \n\t" 3057 "pxor %%mm2, %%mm0 \n\t"
3116 "pxor %%mm3, %%mm1 \n\t" 3058 "pxor %%mm3, %%mm1 \n\t"
3117 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 3059 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3118 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 3060 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3119 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 3061 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3120 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 3062 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3121 3063
3122 "pxor %%mm6, %%mm2 \n\t" 3064 "pxor %%mm6, %%mm2 \n\t"
3123 "pxor %%mm7, %%mm3 \n\t" 3065 "pxor %%mm7, %%mm3 \n\t"
3124 "pand %%mm2, %%mm4 \n\t" 3066 "pand %%mm2, %%mm4 \n\t"
3125 "pand %%mm3, %%mm5 \n\t" 3067 "pand %%mm3, %%mm5 \n\t"
3126 3068
3127 #ifdef HAVE_MMX2 3069 #ifdef HAVE_MMX2
3128 "pminsw %%mm0, %%mm4 \n\t" 3070 "pminsw %%mm0, %%mm4 \n\t"
3129 "pminsw %%mm1, %%mm5 \n\t" 3071 "pminsw %%mm1, %%mm5 \n\t"
3130 #else 3072 #else
3131 "movq %%mm4, %%mm2 \n\t" 3073 "movq %%mm4, %%mm2 \n\t"
3132 "psubusw %%mm0, %%mm2 \n\t" 3074 "psubusw %%mm0, %%mm2 \n\t"
3133 "psubw %%mm2, %%mm4 \n\t" 3075 "psubw %%mm2, %%mm4 \n\t"
3134 "movq %%mm5, %%mm2 \n\t" 3076 "movq %%mm5, %%mm2 \n\t"
3135 "psubusw %%mm1, %%mm2 \n\t" 3077 "psubusw %%mm1, %%mm2 \n\t"
3136 "psubw %%mm2, %%mm5 \n\t" 3078 "psubw %%mm2, %%mm5 \n\t"
3137 #endif 3079 #endif
3138 "pxor %%mm6, %%mm4 \n\t" 3080 "pxor %%mm6, %%mm4 \n\t"
3139 "pxor %%mm7, %%mm5 \n\t" 3081 "pxor %%mm7, %%mm5 \n\t"
3140 "psubw %%mm6, %%mm4 \n\t" 3082 "psubw %%mm6, %%mm4 \n\t"
3141 "psubw %%mm7, %%mm5 \n\t" 3083 "psubw %%mm7, %%mm5 \n\t"
3142 "packsswb %%mm5, %%mm4 \n\t" 3084 "packsswb %%mm5, %%mm4 \n\t"
3143 "movq %3, %%mm1 \n\t" 3085 "movq %3, %%mm1 \n\t"
3144 "pandn %%mm4, %%mm1 \n\t" 3086 "pandn %%mm4, %%mm1 \n\t"
3145 "movq (%0), %%mm0 \n\t" 3087 "movq (%0), %%mm0 \n\t"
3146 "paddb %%mm1, %%mm0 \n\t" 3088 "paddb %%mm1, %%mm0 \n\t"
3147 "movq %%mm0, (%0) \n\t" 3089 "movq %%mm0, (%0) \n\t"
3148 "movq (%0, %1), %%mm0 \n\t" 3090 "movq (%0, %1), %%mm0 \n\t"
3149 "psubb %%mm1, %%mm0 \n\t" 3091 "psubb %%mm1, %%mm0 \n\t"
3150 "movq %%mm0, (%0, %1) \n\t" 3092 "movq %%mm0, (%0, %1) \n\t"
3151 3093
3152 : "+r" (temp_src) 3094 : "+r" (temp_src)
3153 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) 3095 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3154 : "%"REG_a, "%"REG_c 3096 : "%"REG_a, "%"REG_c
3155 ); 3097 );
3156 } 3098 }
3157 /*if(step==16){ 3099 /*if(step==16){
3158 STOP_TIMER("step16") 3100 STOP_TIMER("step16")
3159 }else{ 3101 }else{
3160 STOP_TIMER("stepX") 3102 STOP_TIMER("stepX")
3161 }*/ 3103 }*/
3162 } 3104 }
3163 #endif //HAVE_MMX 3105 #endif //HAVE_MMX
3164 3106
3165 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3107 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3166 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); 3108 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
3167 3109
3168 /** 3110 /**
3169 * Copies a block from src to dst and fixes the blacklevel. 3111 * Copies a block from src to dst and fixes the blacklevel.
3170 * levelFix == 0 -> do not touch the brighness & contrast 3112 * levelFix == 0 -> do not touch the brighness & contrast
3171 */ 3113 */
3172 #undef SCALED_CPY 3114 #undef SCALED_CPY
3173 3115
3174 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, 3116 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3175 int levelFix, int64_t *packedOffsetAndScale) 3117 int levelFix, int64_t *packedOffsetAndScale)
3176 { 3118 {
3177 #ifndef HAVE_MMX 3119 #ifndef HAVE_MMX
3178 int i; 3120 int i;
3179 #endif 3121 #endif
3180 if(levelFix) 3122 if(levelFix){
3181 {
3182 #ifdef HAVE_MMX 3123 #ifdef HAVE_MMX
3183 asm volatile( 3124 asm volatile(
3184 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset 3125 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
3185 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale 3126 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
3186 "lea (%2,%4), %%"REG_a" \n\t" 3127 "lea (%2,%4), %%"REG_a" \n\t"
3187 "lea (%3,%5), %%"REG_d" \n\t" 3128 "lea (%3,%5), %%"REG_d" \n\t"
3188 "pxor %%mm4, %%mm4 \n\t" 3129 "pxor %%mm4, %%mm4 \n\t"
3189 #ifdef HAVE_MMX2 3130 #ifdef HAVE_MMX2
3190 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3191 "movq " #src1 ", %%mm0 \n\t"\ 3132 "movq " #src1 ", %%mm0 \n\t"\
3192 "movq " #src1 ", %%mm5 \n\t"\ 3133 "movq " #src1 ", %%mm5 \n\t"\
3193 "movq " #src2 ", %%mm1 \n\t"\ 3134 "movq " #src2 ", %%mm1 \n\t"\
3194 "movq " #src2 ", %%mm6 \n\t"\ 3135 "movq " #src2 ", %%mm6 \n\t"\
3195 "punpcklbw %%mm0, %%mm0 \n\t"\ 3136 "punpcklbw %%mm0, %%mm0 \n\t"\
3196 "punpckhbw %%mm5, %%mm5 \n\t"\ 3137 "punpckhbw %%mm5, %%mm5 \n\t"\
3197 "punpcklbw %%mm1, %%mm1 \n\t"\ 3138 "punpcklbw %%mm1, %%mm1 \n\t"\
3198 "punpckhbw %%mm6, %%mm6 \n\t"\ 3139 "punpckhbw %%mm6, %%mm6 \n\t"\
3199 "pmulhuw %%mm3, %%mm0 \n\t"\ 3140 "pmulhuw %%mm3, %%mm0 \n\t"\
3200 "pmulhuw %%mm3, %%mm5 \n\t"\ 3141 "pmulhuw %%mm3, %%mm5 \n\t"\
3201 "pmulhuw %%mm3, %%mm1 \n\t"\ 3142 "pmulhuw %%mm3, %%mm1 \n\t"\
3202 "pmulhuw %%mm3, %%mm6 \n\t"\ 3143 "pmulhuw %%mm3, %%mm6 \n\t"\
3203 "psubw %%mm2, %%mm0 \n\t"\ 3144 "psubw %%mm2, %%mm0 \n\t"\
3204 "psubw %%mm2, %%mm5 \n\t"\ 3145 "psubw %%mm2, %%mm5 \n\t"\
3205 "psubw %%mm2, %%mm1 \n\t"\ 3146 "psubw %%mm2, %%mm1 \n\t"\
3206 "psubw %%mm2, %%mm6 \n\t"\ 3147 "psubw %%mm2, %%mm6 \n\t"\
3207 "packuswb %%mm5, %%mm0 \n\t"\ 3148 "packuswb %%mm5, %%mm0 \n\t"\
3208 "packuswb %%mm6, %%mm1 \n\t"\ 3149 "packuswb %%mm6, %%mm1 \n\t"\
3209 "movq %%mm0, " #dst1 " \n\t"\ 3150 "movq %%mm0, " #dst1 " \n\t"\
3210 "movq %%mm1, " #dst2 " \n\t"\ 3151 "movq %%mm1, " #dst2 " \n\t"\
3211 3152
3212 #else //HAVE_MMX2 3153 #else //HAVE_MMX2
3213 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3154 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3214 "movq " #src1 ", %%mm0 \n\t"\ 3155 "movq " #src1 ", %%mm0 \n\t"\
3215 "movq " #src1 ", %%mm5 \n\t"\ 3156 "movq " #src1 ", %%mm5 \n\t"\
3216 "punpcklbw %%mm4, %%mm0 \n\t"\ 3157 "punpcklbw %%mm4, %%mm0 \n\t"\
3217 "punpckhbw %%mm4, %%mm5 \n\t"\ 3158 "punpckhbw %%mm4, %%mm5 \n\t"\
3218 "psubw %%mm2, %%mm0 \n\t"\ 3159 "psubw %%mm2, %%mm0 \n\t"\
3219 "psubw %%mm2, %%mm5 \n\t"\ 3160 "psubw %%mm2, %%mm5 \n\t"\
3220 "movq " #src2 ", %%mm1 \n\t"\ 3161 "movq " #src2 ", %%mm1 \n\t"\
3221 "psllw $6, %%mm0 \n\t"\ 3162 "psllw $6, %%mm0 \n\t"\
3222 "psllw $6, %%mm5 \n\t"\ 3163 "psllw $6, %%mm5 \n\t"\
3223 "pmulhw %%mm3, %%mm0 \n\t"\ 3164 "pmulhw %%mm3, %%mm0 \n\t"\
3224 "movq " #src2 ", %%mm6 \n\t"\ 3165 "movq " #src2 ", %%mm6 \n\t"\
3225 "pmulhw %%mm3, %%mm5 \n\t"\ 3166 "pmulhw %%mm3, %%mm5 \n\t"\
3226 "punpcklbw %%mm4, %%mm1 \n\t"\ 3167 "punpcklbw %%mm4, %%mm1 \n\t"\
3227 "punpckhbw %%mm4, %%mm6 \n\t"\ 3168 "punpckhbw %%mm4, %%mm6 \n\t"\
3228 "psubw %%mm2, %%mm1 \n\t"\ 3169 "psubw %%mm2, %%mm1 \n\t"\
3229 "psubw %%mm2, %%mm6 \n\t"\ 3170 "psubw %%mm2, %%mm6 \n\t"\
3230 "psllw $6, %%mm1 \n\t"\ 3171 "psllw $6, %%mm1 \n\t"\
3231 "psllw $6, %%mm6 \n\t"\ 3172 "psllw $6, %%mm6 \n\t"\
3232 "pmulhw %%mm3, %%mm1 \n\t"\ 3173 "pmulhw %%mm3, %%mm1 \n\t"\
3233 "pmulhw %%mm3, %%mm6 \n\t"\ 3174 "pmulhw %%mm3, %%mm6 \n\t"\
3234 "packuswb %%mm5, %%mm0 \n\t"\ 3175 "packuswb %%mm5, %%mm0 \n\t"\
3235 "packuswb %%mm6, %%mm1 \n\t"\ 3176 "packuswb %%mm6, %%mm1 \n\t"\
3236 "movq %%mm0, " #dst1 " \n\t"\ 3177 "movq %%mm0, " #dst1 " \n\t"\
3237 "movq %%mm1, " #dst2 " \n\t"\ 3178 "movq %%mm1, " #dst2 " \n\t"\
3238 3179
3239 #endif //HAVE_MMX2 3180 #endif //HAVE_MMX2
3240 #define SCALED_CPY(src1, src2, dst1, dst2)\ 3181 #define SCALED_CPY(src1, src2, dst1, dst2)\
3241 REAL_SCALED_CPY(src1, src2, dst1, dst2) 3182 REAL_SCALED_CPY(src1, src2, dst1, dst2)
3242 3183
3243 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) 3184 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3244 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) 3185 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
3245 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) 3186 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
3246 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" 3187 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
3247 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" 3188 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
3248 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) 3189 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
3249 3190
3250 3191
3251 : "=&a" (packedOffsetAndScale) 3192 : "=&a" (packedOffsetAndScale)
3252 : "0" (packedOffsetAndScale), 3193 : "0" (packedOffsetAndScale),
3253 "r"(src), 3194 "r"(src),
3254 "r"(dst), 3195 "r"(dst),
3255 "r" ((long)srcStride), 3196 "r" ((long)srcStride),
3256 "r" ((long)dstStride) 3197 "r" ((long)dstStride)
3257 : "%"REG_d 3198 : "%"REG_d
3258 ); 3199 );
3259 #else //HAVE_MMX 3200 #else //HAVE_MMX
3260 for(i=0; i<8; i++) 3201 for(i=0; i<8; i++)
3261 memcpy( &(dst[dstStride*i]), 3202 memcpy( &(dst[dstStride*i]),
3262 &(src[srcStride*i]), BLOCK_SIZE); 3203 &(src[srcStride*i]), BLOCK_SIZE);
3263 #endif //HAVE_MMX 3204 #endif //HAVE_MMX
3264 } 3205 }else{
3265 else
3266 {
3267 #ifdef HAVE_MMX 3206 #ifdef HAVE_MMX
3268 asm volatile( 3207 asm volatile(
3269 "lea (%0,%2), %%"REG_a" \n\t" 3208 "lea (%0,%2), %%"REG_a" \n\t"
3270 "lea (%1,%3), %%"REG_d" \n\t" 3209 "lea (%1,%3), %%"REG_d" \n\t"
3271 3210
3272 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ 3211 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3273 "movq " #src1 ", %%mm0 \n\t"\ 3212 "movq " #src1 ", %%mm0 \n\t"\
3274 "movq " #src2 ", %%mm1 \n\t"\ 3213 "movq " #src2 ", %%mm1 \n\t"\
3275 "movq %%mm0, " #dst1 " \n\t"\ 3214 "movq %%mm0, " #dst1 " \n\t"\
3276 "movq %%mm1, " #dst2 " \n\t"\ 3215 "movq %%mm1, " #dst2 " \n\t"\
3277 3216
3278 #define SIMPLE_CPY(src1, src2, dst1, dst2)\ 3217 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3279 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) 3218 REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3280 3219
3281 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 3220 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3282 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) 3221 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
3283 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) 3222 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
3284 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" 3223 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
3285 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" 3224 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
3286 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) 3225 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
3287 3226
3288 : : "r" (src), 3227 : : "r" (src),
3289 "r" (dst), 3228 "r" (dst),
3290 "r" ((long)srcStride), 3229 "r" ((long)srcStride),
3291 "r" ((long)dstStride) 3230 "r" ((long)dstStride)
3292 : "%"REG_a, "%"REG_d 3231 : "%"REG_a, "%"REG_d
3293 ); 3232 );
3294 #else //HAVE_MMX 3233 #else //HAVE_MMX
3295 for(i=0; i<8; i++) 3234 for(i=0; i<8; i++)
3296 memcpy( &(dst[dstStride*i]), 3235 memcpy( &(dst[dstStride*i]),
3297 &(src[srcStride*i]), BLOCK_SIZE); 3236 &(src[srcStride*i]), BLOCK_SIZE);
3298 #endif //HAVE_MMX 3237 #endif //HAVE_MMX
3299 } 3238 }
3300 } 3239 }
3301 3240
3302 /** 3241 /**
3303 * Duplicates the given 8 src pixels ? times upward 3242 * Duplicates the given 8 src pixels ? times upward
3304 */ 3243 */
3305 static inline void RENAME(duplicate)(uint8_t src[], int stride) 3244 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3306 { 3245 {
3307 #ifdef HAVE_MMX 3246 #ifdef HAVE_MMX
3308 asm volatile( 3247 asm volatile(
3309 "movq (%0), %%mm0 \n\t" 3248 "movq (%0), %%mm0 \n\t"
3310 "add %1, %0 \n\t" 3249 "add %1, %0 \n\t"
3311 "movq %%mm0, (%0) \n\t" 3250 "movq %%mm0, (%0) \n\t"
3312 "movq %%mm0, (%0, %1) \n\t" 3251 "movq %%mm0, (%0, %1) \n\t"
3313 "movq %%mm0, (%0, %1, 2) \n\t" 3252 "movq %%mm0, (%0, %1, 2) \n\t"
3314 : "+r" (src) 3253 : "+r" (src)
3315 : "r" ((long)-stride) 3254 : "r" ((long)-stride)
3316 ); 3255 );
3317 #else 3256 #else
3318 int i; 3257 int i;
3319 uint8_t *p=src; 3258 uint8_t *p=src;
3320 for(i=0; i<3; i++) 3259 for(i=0; i<3; i++){
3321 { 3260 p-= stride;
3322 p-= stride; 3261 memcpy(p, src, 8);
3323 memcpy(p, src, 8); 3262 }
3324 }
3325 #endif 3263 #endif
3326 } 3264 }
3327 3265
3328 /** 3266 /**
3329 * Filters array of bytes (Y or U or V values) 3267 * Filters array of bytes (Y or U or V values)
3330 */ 3268 */
3331 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 3269 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3332 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) 3270 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3333 { 3271 {
3334 DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access 3272 DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3335 int x,y; 3273 int x,y;
3336 #ifdef COMPILE_TIME_MODE 3274 #ifdef COMPILE_TIME_MODE
3337 const int mode= COMPILE_TIME_MODE; 3275 const int mode= COMPILE_TIME_MODE;
3338 #else 3276 #else
3339 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; 3277 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3340 #endif 3278 #endif
3341 int black=0, white=255; // blackest black and whitest white in the picture 3279 int black=0, white=255; // blackest black and whitest white in the picture
3342 int QPCorrecture= 256*256; 3280 int QPCorrecture= 256*256;
3343 3281
3344 int copyAhead; 3282 int copyAhead;
3345 #ifdef HAVE_MMX 3283 #ifdef HAVE_MMX
3284 int i;
3285 #endif
3286
3287 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3288 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3289
3290 //FIXME remove
3291 uint64_t * const yHistogram= c.yHistogram;
3292 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3293 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3294 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3295
3296 #ifdef HAVE_MMX
3297 for(i=0; i<57; i++){
3298 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3299 int threshold= offset*2 + 1;
3300 c.mmxDcOffset[i]= 0x7F - offset;
3301 c.mmxDcThreshold[i]= 0x7F - threshold;
3302 c.mmxDcOffset[i]*= 0x0101010101010101LL;
3303 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3304 }
3305 #endif
3306
3307 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3308 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3309 || (mode & FFMPEG_DEINT_FILTER)
3310 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3311 else if( (mode & V_DEBLOCK)
3312 || (mode & LINEAR_IPOL_DEINT_FILTER)
3313 || (mode & MEDIAN_DEINT_FILTER)
3314 || (mode & V_A_DEBLOCK)) copyAhead=13;
3315 else if(mode & V_X1_FILTER) copyAhead=11;
3316 // else if(mode & V_RK1_FILTER) copyAhead=10;
3317 else if(mode & DERING) copyAhead=9;
3318 else copyAhead=8;
3319
3320 copyAhead-= 8;
3321
3322 if(!isColor){
3323 uint64_t sum= 0;
3346 int i; 3324 int i;
3325 uint64_t maxClipped;
3326 uint64_t clipped;
3327 double scale;
3328
3329 c.frameNum++;
3330 // first frame is fscked so we ignore it
3331 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3332
3333 for(i=0; i<256; i++){
3334 sum+= yHistogram[i];
3335 }
3336
3337 /* We always get a completely black picture first. */
3338 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3339
3340 clipped= sum;
3341 for(black=255; black>0; black--){
3342 if(clipped < maxClipped) break;
3343 clipped-= yHistogram[black];
3344 }
3345
3346 clipped= sum;
3347 for(white=0; white<256; white++){
3348 if(clipped < maxClipped) break;
3349 clipped-= yHistogram[white];
3350 }
3351
3352 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3353
3354 #ifdef HAVE_MMX2
3355 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3356 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3357 #else
3358 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3359 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3347 #endif 3360 #endif
3348 3361
3349 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; 3362 c.packedYOffset|= c.packedYOffset<<32;
3350 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; 3363 c.packedYOffset|= c.packedYOffset<<16;
3351 3364
3352 //FIXME remove 3365 c.packedYScale|= c.packedYScale<<32;
3353 uint64_t * const yHistogram= c.yHistogram; 3366 c.packedYScale|= c.packedYScale<<16;
3354 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; 3367
3355 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; 3368 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
3356 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; 3369 else QPCorrecture= 256*256;
3357 3370 }else{
3358 #ifdef HAVE_MMX 3371 c.packedYScale= 0x0100010001000100LL;
3359 for(i=0; i<57; i++){ 3372 c.packedYOffset= 0;
3360 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; 3373 QPCorrecture= 256*256;
3361 int threshold= offset*2 + 1; 3374 }
3362 c.mmxDcOffset[i]= 0x7F - offset; 3375
3363 c.mmxDcThreshold[i]= 0x7F - threshold; 3376 /* copy & deinterlace first row of blocks */
3364 c.mmxDcOffset[i]*= 0x0101010101010101LL; 3377 y=-BLOCK_SIZE;
3365 c.mmxDcThreshold[i]*= 0x0101010101010101LL; 3378 {
3366 } 3379 const uint8_t *srcBlock= &(src[y*srcStride]);
3367 #endif 3380 uint8_t *dstBlock= tempDst + dstStride;
3368 3381
3369 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; 3382 // From this point on it is guaranteed that we can read and write 16 lines downward
3370 else if( (mode & LINEAR_BLEND_DEINT_FILTER) 3383 // finish 1 block before the next otherwise we might have a problem
3371 || (mode & FFMPEG_DEINT_FILTER) 3384 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3372 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; 3385 for(x=0; x<width; x+=BLOCK_SIZE){
3373 else if( (mode & V_DEBLOCK)
3374 || (mode & LINEAR_IPOL_DEINT_FILTER)
3375 || (mode & MEDIAN_DEINT_FILTER)
3376 || (mode & V_A_DEBLOCK)) copyAhead=13;
3377 else if(mode & V_X1_FILTER) copyAhead=11;
3378 // else if(mode & V_RK1_FILTER) copyAhead=10;
3379 else if(mode & DERING) copyAhead=9;
3380 else copyAhead=8;
3381
3382 copyAhead-= 8;
3383
3384 if(!isColor)
3385 {
3386 uint64_t sum= 0;
3387 int i;
3388 uint64_t maxClipped;
3389 uint64_t clipped;
3390 double scale;
3391
3392 c.frameNum++;
3393 // first frame is fscked so we ignore it
3394 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3395
3396 for(i=0; i<256; i++)
3397 {
3398 sum+= yHistogram[i];
3399 }
3400
3401 /* We always get a completely black picture first. */
3402 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3403
3404 clipped= sum;
3405 for(black=255; black>0; black--)
3406 {
3407 if(clipped < maxClipped) break;
3408 clipped-= yHistogram[black];
3409 }
3410
3411 clipped= sum;
3412 for(white=0; white<256; white++)
3413 {
3414 if(clipped < maxClipped) break;
3415 clipped-= yHistogram[white];
3416 }
3417
3418 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3419
3420 #ifdef HAVE_MMX2
3421 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3422 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3423 #else
3424 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3425 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3426 #endif
3427
3428 c.packedYOffset|= c.packedYOffset<<32;
3429 c.packedYOffset|= c.packedYOffset<<16;
3430
3431 c.packedYScale|= c.packedYScale<<32;
3432 c.packedYScale|= c.packedYScale<<16;
3433
3434 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
3435 else QPCorrecture= 256*256;
3436 }
3437 else
3438 {
3439 c.packedYScale= 0x0100010001000100LL;
3440 c.packedYOffset= 0;
3441 QPCorrecture= 256*256;
3442 }
3443
3444 /* copy & deinterlace first row of blocks */
3445 y=-BLOCK_SIZE;
3446 {
3447 const uint8_t *srcBlock= &(src[y*srcStride]);
3448 uint8_t *dstBlock= tempDst + dstStride;
3449
3450 // From this point on it is guaranteed that we can read and write 16 lines downward
3451 // finish 1 block before the next otherwise we might have a problem
3452 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3453 for(x=0; x<width; x+=BLOCK_SIZE)
3454 {
3455 3386
3456 #ifdef HAVE_MMX2 3387 #ifdef HAVE_MMX2
3457 /* 3388 /*
3458 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3389 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3459 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3390 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3460 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3391 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3461 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3392 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3462 */ 3393 */
3463 3394
3464 asm( 3395 asm(
3465 "mov %4, %%"REG_a" \n\t" 3396 "mov %4, %%"REG_a" \n\t"
3466 "shr $2, %%"REG_a" \n\t" 3397 "shr $2, %%"REG_a" \n\t"
3467 "and $6, %%"REG_a" \n\t" 3398 "and $6, %%"REG_a" \n\t"
3468 "add %5, %%"REG_a" \n\t" 3399 "add %5, %%"REG_a" \n\t"
3469 "mov %%"REG_a", %%"REG_d" \n\t" 3400 "mov %%"REG_a", %%"REG_d" \n\t"
3470 "imul %1, %%"REG_a" \n\t" 3401 "imul %1, %%"REG_a" \n\t"
3471 "imul %3, %%"REG_d" \n\t" 3402 "imul %3, %%"REG_d" \n\t"
3472 "prefetchnta 32(%%"REG_a", %0) \n\t" 3403 "prefetchnta 32(%%"REG_a", %0) \n\t"
3473 "prefetcht0 32(%%"REG_d", %2) \n\t" 3404 "prefetcht0 32(%%"REG_d", %2) \n\t"
3474 "add %1, %%"REG_a" \n\t" 3405 "add %1, %%"REG_a" \n\t"
3475 "add %3, %%"REG_d" \n\t" 3406 "add %3, %%"REG_d" \n\t"
3476 "prefetchnta 32(%%"REG_a", %0) \n\t" 3407 "prefetchnta 32(%%"REG_a", %0) \n\t"
3477 "prefetcht0 32(%%"REG_d", %2) \n\t" 3408 "prefetcht0 32(%%"REG_d", %2) \n\t"
3478 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), 3409 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3479 "g" ((long)x), "g" ((long)copyAhead) 3410 "g" ((long)x), "g" ((long)copyAhead)
3480 : "%"REG_a, "%"REG_d 3411 : "%"REG_a, "%"REG_d
3481 ); 3412 );
3482 3413
3483 #elif defined(HAVE_3DNOW) 3414 #elif defined(HAVE_3DNOW)
3484 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 3415 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3485 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3486 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3487 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3488 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3489 */ 3420 */
3490 #endif 3421 #endif
3491 3422
3492 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, 3423 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3493 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3424 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3494 3425
3495 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); 3426 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3496 3427
3497 if(mode & LINEAR_IPOL_DEINT_FILTER) 3428 if(mode & LINEAR_IPOL_DEINT_FILTER)
3498 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3429 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3499 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3430 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3500 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3431 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3501 else if(mode & MEDIAN_DEINT_FILTER) 3432 else if(mode & MEDIAN_DEINT_FILTER)
3502 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3433 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3503 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3434 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3504 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3435 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3505 else if(mode & FFMPEG_DEINT_FILTER) 3436 else if(mode & FFMPEG_DEINT_FILTER)
3506 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3437 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3507 else if(mode & LOWPASS5_DEINT_FILTER) 3438 else if(mode & LOWPASS5_DEINT_FILTER)
3508 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3439 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3509 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3440 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3510 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3441 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3511 */ 3442 */
3512 dstBlock+=8; 3443 dstBlock+=8;
3513 srcBlock+=8; 3444 srcBlock+=8;
3514 }
3515 if(width==FFABS(dstStride))
3516 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3517 else
3518 {
3519 int i;
3520 for(i=0; i<copyAhead; i++)
3521 {
3522 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3523 }
3524 }
3525 } 3445 }
3526 3446 if(width==FFABS(dstStride))
3527 for(y=0; y<height; y+=BLOCK_SIZE) 3447 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3528 { 3448 else{
3529 //1% speedup if these are here instead of the inner loop 3449 int i;
3530 const uint8_t *srcBlock= &(src[y*srcStride]); 3450 for(i=0; i<copyAhead; i++){
3531 uint8_t *dstBlock= &(dst[y*dstStride]); 3451 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3452 }
3453 }
3454 }
3455
3456 for(y=0; y<height; y+=BLOCK_SIZE){
3457 //1% speedup if these are here instead of the inner loop
3458 const uint8_t *srcBlock= &(src[y*srcStride]);
3459 uint8_t *dstBlock= &(dst[y*dstStride]);
3532 #ifdef HAVE_MMX 3460 #ifdef HAVE_MMX
3533 uint8_t *tempBlock1= c.tempBlocks; 3461 uint8_t *tempBlock1= c.tempBlocks;
3534 uint8_t *tempBlock2= c.tempBlocks + 8; 3462 uint8_t *tempBlock2= c.tempBlocks + 8;
3535 #endif 3463 #endif
3536 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; 3464 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3537 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; 3465 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3538 int QP=0; 3466 int QP=0;
3539 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 3467 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3540 if not than use a temporary buffer */ 3468 if not than use a temporary buffer */
3541 if(y+15 >= height) 3469 if(y+15 >= height){
3542 { 3470 int i;
3543 int i; 3471 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3544 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with 3472 blockcopy to dst later */
3545 blockcopy to dst later */ 3473 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3546 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, 3474 FFMAX(height-y-copyAhead, 0), srcStride);
3547 FFMAX(height-y-copyAhead, 0), srcStride); 3475
3548 3476 /* duplicate last line of src to fill the void upto line (copyAhead+7) */
3549 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ 3477 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3550 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++) 3478 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3551 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride)); 3479
3552 3480 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3553 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ 3481 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3554 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride); 3482
3555 3483 /* duplicate last line of dst to fill the void upto line (copyAhead) */
3556 /* duplicate last line of dst to fill the void upto line (copyAhead) */ 3484 for(i=height-y+1; i<=copyAhead; i++)
3557 for(i=height-y+1; i<=copyAhead; i++) 3485 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3558 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride)); 3486
3559 3487 dstBlock= tempDst + dstStride;
3560 dstBlock= tempDst + dstStride; 3488 srcBlock= tempSrc;
3561 srcBlock= tempSrc; 3489 }
3562 } 3490
3563 3491 // From this point on it is guaranteed that we can read and write 16 lines downward
3564 // From this point on it is guaranteed that we can read and write 16 lines downward 3492 // finish 1 block before the next otherwise we might have a problem
3565 // finish 1 block before the next otherwise we might have a problem 3493 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3566 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 3494 for(x=0; x<width; x+=BLOCK_SIZE){
3567 for(x=0; x<width; x+=BLOCK_SIZE) 3495 const int stride= dstStride;
3568 {
3569 const int stride= dstStride;
3570 #ifdef HAVE_MMX 3496 #ifdef HAVE_MMX
3571 uint8_t *tmpXchg; 3497 uint8_t *tmpXchg;
3572 #endif 3498 #endif
3573 if(isColor) 3499 if(isColor){
3574 { 3500 QP= QPptr[x>>qpHShift];
3575 QP= QPptr[x>>qpHShift]; 3501 c.nonBQP= nonBQPptr[x>>qpHShift];
3576 c.nonBQP= nonBQPptr[x>>qpHShift]; 3502 }else{
3577 } 3503 QP= QPptr[x>>4];
3578 else 3504 QP= (QP* QPCorrecture + 256*128)>>16;
3579 { 3505 c.nonBQP= nonBQPptr[x>>4];
3580 QP= QPptr[x>>4]; 3506 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3581 QP= (QP* QPCorrecture + 256*128)>>16; 3507 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3582 c.nonBQP= nonBQPptr[x>>4]; 3508 }
3583 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; 3509 c.QP= QP;
3584 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3585 }
3586 c.QP= QP;
3587 #ifdef HAVE_MMX 3510 #ifdef HAVE_MMX
3588 asm volatile( 3511 asm volatile(
3589 "movd %1, %%mm7 \n\t" 3512 "movd %1, %%mm7 \n\t"
3590 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 3513 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3591 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 3514 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3592 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 3515 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3593 "movq %%mm7, %0 \n\t" 3516 "movq %%mm7, %0 \n\t"
3594 : "=m" (c.pQPb) 3517 : "=m" (c.pQPb)
3595 : "r" (QP) 3518 : "r" (QP)
3596 ); 3519 );
3597 #endif 3520 #endif
3598 3521
3599 3522
3600 #ifdef HAVE_MMX2 3523 #ifdef HAVE_MMX2
3601 /* 3524 /*
3602 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3525 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3603 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3526 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3604 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3527 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3605 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3528 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3606 */ 3529 */
3607 3530
3608 asm( 3531 asm(
3609 "mov %4, %%"REG_a" \n\t" 3532 "mov %4, %%"REG_a" \n\t"
3610 "shr $2, %%"REG_a" \n\t" 3533 "shr $2, %%"REG_a" \n\t"
3611 "and $6, %%"REG_a" \n\t" 3534 "and $6, %%"REG_a" \n\t"
3612 "add %5, %%"REG_a" \n\t" 3535 "add %5, %%"REG_a" \n\t"
3613 "mov %%"REG_a", %%"REG_d" \n\t" 3536 "mov %%"REG_a", %%"REG_d" \n\t"
3614 "imul %1, %%"REG_a" \n\t" 3537 "imul %1, %%"REG_a" \n\t"
3615 "imul %3, %%"REG_d" \n\t" 3538 "imul %3, %%"REG_d" \n\t"
3616 "prefetchnta 32(%%"REG_a", %0) \n\t" 3539 "prefetchnta 32(%%"REG_a", %0) \n\t"
3617 "prefetcht0 32(%%"REG_d", %2) \n\t" 3540 "prefetcht0 32(%%"REG_d", %2) \n\t"
3618 "add %1, %%"REG_a" \n\t" 3541 "add %1, %%"REG_a" \n\t"
3619 "add %3, %%"REG_d" \n\t" 3542 "add %3, %%"REG_d" \n\t"
3620 "prefetchnta 32(%%"REG_a", %0) \n\t" 3543 "prefetchnta 32(%%"REG_a", %0) \n\t"
3621 "prefetcht0 32(%%"REG_d", %2) \n\t" 3544 "prefetcht0 32(%%"REG_d", %2) \n\t"
3622 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), 3545 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3623 "g" ((long)x), "g" ((long)copyAhead) 3546 "g" ((long)x), "g" ((long)copyAhead)
3624 : "%"REG_a, "%"REG_d 3547 : "%"REG_a, "%"REG_d
3625 ); 3548 );
3626 3549
3627 #elif defined(HAVE_3DNOW) 3550 #elif defined(HAVE_3DNOW)
3628 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 3551 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3629 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3630 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3631 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3632 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3633 */ 3556 */
3634 #endif 3557 #endif
3635 3558
3636 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, 3559 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3637 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 3560 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3638 3561
3639 if(mode & LINEAR_IPOL_DEINT_FILTER) 3562 if(mode & LINEAR_IPOL_DEINT_FILTER)
3640 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 3563 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3641 else if(mode & LINEAR_BLEND_DEINT_FILTER) 3564 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3642 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 3565 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3643 else if(mode & MEDIAN_DEINT_FILTER) 3566 else if(mode & MEDIAN_DEINT_FILTER)
3644 RENAME(deInterlaceMedian)(dstBlock, dstStride); 3567 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3645 else if(mode & CUBIC_IPOL_DEINT_FILTER) 3568 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3646 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 3569 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3647 else if(mode & FFMPEG_DEINT_FILTER) 3570 else if(mode & FFMPEG_DEINT_FILTER)
3648 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 3571 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3649 else if(mode & LOWPASS5_DEINT_FILTER) 3572 else if(mode & LOWPASS5_DEINT_FILTER)
3650 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 3573 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3651 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 3574 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3652 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 3575 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3653 */ 3576 */
3654 3577
3655 /* only deblock if we have 2 blocks */ 3578 /* only deblock if we have 2 blocks */
3656 if(y + 8 < height) 3579 if(y + 8 < height){
3657 { 3580 if(mode & V_X1_FILTER)
3658 if(mode & V_X1_FILTER) 3581 RENAME(vertX1Filter)(dstBlock, stride, &c);
3659 RENAME(vertX1Filter)(dstBlock, stride, &c); 3582 else if(mode & V_DEBLOCK){
3660 else if(mode & V_DEBLOCK) 3583 const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3661 { 3584
3662 const int t= RENAME(vertClassify)(dstBlock, stride, &c); 3585 if(t==1)
3663 3586 RENAME(doVertLowPass)(dstBlock, stride, &c);
3664 if(t==1) 3587 else if(t==2)
3665 RENAME(doVertLowPass)(dstBlock, stride, &c); 3588 RENAME(doVertDefFilter)(dstBlock, stride, &c);
3666 else if(t==2) 3589 }else if(mode & V_A_DEBLOCK){
3667 RENAME(doVertDefFilter)(dstBlock, stride, &c); 3590 RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3668 }else if(mode & V_A_DEBLOCK){ 3591 }
3669 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); 3592 }
3670 }
3671 }
3672 3593
3673 #ifdef HAVE_MMX 3594 #ifdef HAVE_MMX
3674 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 3595 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3675 #endif 3596 #endif
3676 /* check if we have a previous block to deblock it with dstBlock */ 3597 /* check if we have a previous block to deblock it with dstBlock */
3677 if(x - 8 >= 0) 3598 if(x - 8 >= 0){
3678 {
3679 #ifdef HAVE_MMX 3599 #ifdef HAVE_MMX
3680 if(mode & H_X1_FILTER) 3600 if(mode & H_X1_FILTER)
3681 RENAME(vertX1Filter)(tempBlock1, 16, &c); 3601 RENAME(vertX1Filter)(tempBlock1, 16, &c);
3682 else if(mode & H_DEBLOCK) 3602 else if(mode & H_DEBLOCK){
3683 {
3684 //START_TIMER 3603 //START_TIMER
3685 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); 3604 const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3686 //STOP_TIMER("dc & minmax") 3605 //STOP_TIMER("dc & minmax")
3687 if(t==1) 3606 if(t==1)
3688 RENAME(doVertLowPass)(tempBlock1, 16, &c); 3607 RENAME(doVertLowPass)(tempBlock1, 16, &c);
3689 else if(t==2) 3608 else if(t==2)
3690 RENAME(doVertDefFilter)(tempBlock1, 16, &c); 3609 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3691 }else if(mode & H_A_DEBLOCK){ 3610 }else if(mode & H_A_DEBLOCK){
3692 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); 3611 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3693 } 3612 }
3694 3613
3695 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); 3614 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3696 3615
3697 #else 3616 #else
3698 if(mode & H_X1_FILTER) 3617 if(mode & H_X1_FILTER)
3699 horizX1Filter(dstBlock-4, stride, QP); 3618 horizX1Filter(dstBlock-4, stride, QP);
3700 else if(mode & H_DEBLOCK) 3619 else if(mode & H_DEBLOCK){
3701 {
3702 #ifdef HAVE_ALTIVEC 3620 #ifdef HAVE_ALTIVEC
3703 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]); 3621 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
3704 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); 3622 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3705 3623
3706 const int t=vertClassify_altivec(tempBlock-48, 16, &c); 3624 const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3707 if(t==1) { 3625 if(t==1) {
3708 doVertLowPass_altivec(tempBlock-48, 16, &c); 3626 doVertLowPass_altivec(tempBlock-48, 16, &c);
3709 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3627 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3710 } 3628 }
3711 else if(t==2) { 3629 else if(t==2) {
3712 doVertDefFilter_altivec(tempBlock-48, 16, &c); 3630 doVertDefFilter_altivec(tempBlock-48, 16, &c);
3713 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 3631 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3714 } 3632 }
3715 #else 3633 #else
3716 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); 3634 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3717 3635
3718 if(t==1) 3636 if(t==1)
3719 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); 3637 RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3720 else if(t==2) 3638 else if(t==2)
3721 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); 3639 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3722 #endif 3640 #endif
3723 }else if(mode & H_A_DEBLOCK){ 3641 }else if(mode & H_A_DEBLOCK){
3724 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); 3642 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3725 } 3643 }
3726 #endif //HAVE_MMX 3644 #endif //HAVE_MMX
3727 if(mode & DERING) 3645 if(mode & DERING){
3728 { 3646 //FIXME filter first line
3729 //FIXME filter first line 3647 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3730 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); 3648 }
3731 } 3649
3732 3650 if(mode & TEMP_NOISE_FILTER)
3733 if(mode & TEMP_NOISE_FILTER) 3651 {
3734 { 3652 RENAME(tempNoiseReducer)(dstBlock-8, stride,
3735 RENAME(tempNoiseReducer)(dstBlock-8, stride, 3653 c.tempBlured[isColor] + y*dstStride + x,
3736 c.tempBlured[isColor] + y*dstStride + x, 3654 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3737 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), 3655 c.ppMode.maxTmpNoise);
3738 c.ppMode.maxTmpNoise); 3656 }
3739 } 3657 }
3740 } 3658
3741 3659 dstBlock+=8;
3742 dstBlock+=8; 3660 srcBlock+=8;
3743 srcBlock+=8;
3744 3661
3745 #ifdef HAVE_MMX 3662 #ifdef HAVE_MMX
3746 tmpXchg= tempBlock1; 3663 tmpXchg= tempBlock1;
3747 tempBlock1= tempBlock2; 3664 tempBlock1= tempBlock2;
3748 tempBlock2 = tmpXchg; 3665 tempBlock2 = tmpXchg;
3749 #endif 3666 #endif
3667 }
3668
3669 if(mode & DERING){
3670 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3671 }
3672
3673 if((mode & TEMP_NOISE_FILTER)){
3674 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3675 c.tempBlured[isColor] + y*dstStride + x,
3676 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3677 c.ppMode.maxTmpNoise);
3678 }
3679
3680 /* did we use a tmp buffer for the last lines*/
3681 if(y+15 >= height){
3682 uint8_t *dstBlock= &(dst[y*dstStride]);
3683 if(width==FFABS(dstStride))
3684 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3685 else{
3686 int i;
3687 for(i=0; i<height-y; i++){
3688 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3750 } 3689 }
3751 3690 }
3752 if(mode & DERING) 3691 }
3753 {
3754 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3755 }
3756
3757 if((mode & TEMP_NOISE_FILTER))
3758 {
3759 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3760 c.tempBlured[isColor] + y*dstStride + x,
3761 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3762 c.ppMode.maxTmpNoise);
3763 }
3764
3765 /* did we use a tmp buffer for the last lines*/
3766 if(y+15 >= height)
3767 {
3768 uint8_t *dstBlock= &(dst[y*dstStride]);
3769 if(width==FFABS(dstStride))
3770 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3771 else
3772 {
3773 int i;
3774 for(i=0; i<height-y; i++)
3775 {
3776 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3777 }
3778 }
3779 }
3780 /* 3692 /*
3781 for(x=0; x<width; x+=32) 3693 for(x=0; x<width; x+=32){
3782 { 3694 volatile int i;
3783 volatile int i; 3695 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3784 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] 3696 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3785 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] 3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3786 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; 3698 + dstBlock[x +13*dstStride]
3787 // + dstBlock[x +13*dstStride] 3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3788 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; 3700 }*/
3789 }*/ 3701 }
3702 #ifdef HAVE_3DNOW
3703 asm volatile("femms");
3704 #elif defined (HAVE_MMX)
3705 asm volatile("emms");
3706 #endif
3707
3708 #ifdef DEBUG_BRIGHTNESS
3709 if(!isColor){
3710 int max=1;
3711 int i;
3712 for(i=0; i<256; i++)
3713 if(yHistogram[i] > max) max=yHistogram[i];
3714
3715 for(i=1; i<256; i++){
3716 int x;
3717 int start=yHistogram[i-1]/(max/256+1);
3718 int end=yHistogram[i]/(max/256+1);
3719 int inc= end > start ? 1 : -1;
3720 for(x=start; x!=end+inc; x+=inc)
3721 dst[ i*dstStride + x]+=128;
3790 } 3722 }
3791 #ifdef HAVE_3DNOW 3723
3792 asm volatile("femms"); 3724 for(i=0; i<100; i+=2){
3793 #elif defined (HAVE_MMX) 3725 dst[ (white)*dstStride + i]+=128;
3794 asm volatile("emms"); 3726 dst[ (black)*dstStride + i]+=128;
3727 }
3728 }
3795 #endif 3729 #endif
3796 3730
3797 #ifdef DEBUG_BRIGHTNESS 3731 *c2= c; //copy local context back
3798 if(!isColor)
3799 {
3800 int max=1;
3801 int i;
3802 for(i=0; i<256; i++)
3803 if(yHistogram[i] > max) max=yHistogram[i];
3804
3805 for(i=1; i<256; i++)
3806 {
3807 int x;
3808 int start=yHistogram[i-1]/(max/256+1);
3809 int end=yHistogram[i]/(max/256+1);
3810 int inc= end > start ? 1 : -1;
3811 for(x=start; x!=end+inc; x+=inc)
3812 dst[ i*dstStride + x]+=128;
3813 }
3814
3815 for(i=0; i<100; i+=2)
3816 {
3817 dst[ (white)*dstStride + i]+=128;
3818 dst[ (black)*dstStride + i]+=128;
3819 }
3820
3821 }
3822 #endif
3823
3824 *c2= c; //copy local context back
3825 3732
3826 } 3733 }