comparison libpostproc/postprocess_template.c @ 95:8bce253b537c libavcodec

new postprocess code by Michael Niedermayer (michaelni@gmx.at)
author arpi
date Wed, 10 Oct 2001 22:13:27 +0000
parents
children 29ac11dc53d3
1 /*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19 /*
20 C MMX MMX2
21 isVertDC Ec Ec
22 isVertMinMaxOk Ec Ec
23 doVertLowPass E e
24 doVertDefFilter Ec Ec Ec
25 isHorizDC Ec Ec
26 isHorizMinMaxOk a
27 doHorizLowPass E a
28 doHorizDefFilter E a
29 deRing
30
31 E = Exact implementation
32 e = almost exact implementation
33 a = alternative / approximate implementation
34 c = checked against the other implementations (-vo md5)
35 */
36
37 /*
38 TODO:
39 verify that everything works as it should
40 reduce the time wasted on the mem transfer
41 implement dering
42 implement everything in C at least
43 figure out the range of QP (assuming <256 for now)
44 unroll stuff if instructions depend too much on the prior one
45 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
46 move YScale thing to the end instead of fixing QP
47 ...
48
49 Notes:
50
51 */
52
53
54 #include <inttypes.h>
55 #include <stdio.h>
56 #include "../config.h"
57 #include "postprocess.h"
58 //#undef HAVE_MMX2
59 //#undef HAVE_MMX
60
61
62
63 static uint64_t packedYOffset= 0x0000000000000000LL;
64 static uint64_t packedYScale= 0x0100010001000100LL;
65 static uint64_t w05= 0x0005000500050005LL;
66 static uint64_t w20= 0x0020002000200020LL;
67 static uint64_t w1400= 0x1400140014001400LL;
68 static uint64_t bm00000001= 0x00000000000000FFLL;
69 static uint64_t bm00010000= 0x000000FF00000000LL;
70 static uint64_t bm00001000= 0x00000000FF000000LL;
71 static uint64_t bm10000000= 0xFF00000000000000LL;
72 static uint64_t bm10000001= 0xFF000000000000FFLL;
73 static uint64_t bm11000011= 0xFFFF00000000FFFFLL;
74 static uint64_t bm00011000= 0x000000FFFF000000LL;
75 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL;
76 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL;
77 static uint64_t b00= 0x0000000000000000LL;
78 static uint64_t b02= 0x0202020202020202LL;
79 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL;
80 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL;
81 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL;
82 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL;
83 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL;
84 static uint64_t temp0=0;
85 static uint64_t temp1=0;
86 static uint64_t temp2=0;
87 static uint64_t temp3=0;
88 static uint64_t temp4=0;
89 static uint64_t temp5=0;
90 static uint64_t pQPb=0;
91 static uint8_t tempBlock[16*16];
92
93 int hFlatnessThreshold= 56 - 16;
94 int vFlatnessThreshold= 56 - 16;
95
96 // amount of "black" you are willing to lose to get a brightness-corrected picture
97 double maxClippedThreshold= 0.01;
98
99 int maxAllowedY=255;
100 //FIXME can never make a movie's black brighter (does anyone need that?)
101 int minAllowedY=0;
102
103
104 static inline long long rdtsc()
105 {
106 long long l;
107 asm volatile( "rdtsc\n\t"
108 : "=A" (l)
109 );
110 // printf("%d\n", int(l/1000));
111 return l;
112 }
113
114 static inline void prefetchnta(void *p)
115 {
116 asm volatile( "prefetchnta (%0)\n\t"
117 : : "r" (p)
118 );
119 }
120
121 static inline void prefetcht0(void *p)
122 {
123 asm volatile( "prefetcht0 (%0)\n\t"
124 : : "r" (p)
125 );
126 }
127
128 static inline void prefetcht1(void *p)
129 {
130 asm volatile( "prefetcht1 (%0)\n\t"
131 : : "r" (p)
132 );
133 }
134
135 static inline void prefetcht2(void *p)
136 {
137 asm volatile( "prefetcht2 (%0)\n\t"
138 : : "r" (p)
139 );
140 }
141
142 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
143 /**
144 * Check if the middle 8x8 Block in the given 8x10 block is flat
145 */
146 static inline bool isVertDC(uint8_t src[], int stride){
147 // return true;
148 int numEq= 0;
149 src+= stride; // src points to the beginning of the 8x8 block
150 #ifdef HAVE_MMX
151 asm volatile(
152 // "int $3 \n\t"
153 "pushl %1\n\t"
154 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
155 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
156 "movq (%1), %%mm0 \n\t"
157 "addl %2, %1 \n\t"
158 "movq (%1), %%mm1 \n\t"
159 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
160 "paddb %%mm7, %%mm0 \n\t"
161 "pcmpgtb %%mm6, %%mm0 \n\t"
162
163 "addl %2, %1 \n\t"
164 "movq (%1), %%mm2 \n\t"
165 "psubb %%mm2, %%mm1 \n\t"
166 "paddb %%mm7, %%mm1 \n\t"
167 "pcmpgtb %%mm6, %%mm1 \n\t"
168 "paddb %%mm1, %%mm0 \n\t"
169
170 "addl %2, %1 \n\t"
171 "movq (%1), %%mm1 \n\t"
172 "psubb %%mm1, %%mm2 \n\t"
173 "paddb %%mm7, %%mm2 \n\t"
174 "pcmpgtb %%mm6, %%mm2 \n\t"
175 "paddb %%mm2, %%mm0 \n\t"
176
177 "addl %2, %1 \n\t"
178 "movq (%1), %%mm2 \n\t"
179 "psubb %%mm2, %%mm1 \n\t"
180 "paddb %%mm7, %%mm1 \n\t"
181 "pcmpgtb %%mm6, %%mm1 \n\t"
182 "paddb %%mm1, %%mm0 \n\t"
183
184 "addl %2, %1 \n\t"
185 "movq (%1), %%mm1 \n\t"
186 "psubb %%mm1, %%mm2 \n\t"
187 "paddb %%mm7, %%mm2 \n\t"
188 "pcmpgtb %%mm6, %%mm2 \n\t"
189 "paddb %%mm2, %%mm0 \n\t"
190
191 "addl %2, %1 \n\t"
192 "movq (%1), %%mm2 \n\t"
193 "psubb %%mm2, %%mm1 \n\t"
194 "paddb %%mm7, %%mm1 \n\t"
195 "pcmpgtb %%mm6, %%mm1 \n\t"
196 "paddb %%mm1, %%mm0 \n\t"
197
198 "addl %2, %1 \n\t"
199 "movq (%1), %%mm1 \n\t"
200 "psubb %%mm1, %%mm2 \n\t"
201 "paddb %%mm7, %%mm2 \n\t"
202 "pcmpgtb %%mm6, %%mm2 \n\t"
203 "paddb %%mm2, %%mm0 \n\t"
204
205 " \n\t"
206 "movq %%mm0, %%mm1 \n\t"
207 "psrlw $8, %%mm0 \n\t"
208 "paddb %%mm1, %%mm0 \n\t"
209 "movq %%mm0, %%mm1 \n\t"
210 "psrlq $16, %%mm0 \n\t"
211 "paddb %%mm1, %%mm0 \n\t"
212 "movq %%mm0, %%mm1 \n\t"
213 "psrlq $32, %%mm0 \n\t"
214 "paddb %%mm1, %%mm0 \n\t"
215 "popl %1\n\t"
216 "movd %%mm0, %0 \n\t"
217 : "=r" (numEq)
218 : "r" (src), "r" (stride)
219 );
220 // printf("%d\n", numEq);
221 numEq= (256 - (numEq & 0xFF)) &0xFF;
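// each "equal" neighbour pair contributes 0xFF (i.e. -1) in its byte lane via pcmpgtb;
// the shift/paddb tree above folds all 8 lanes into the low byte of mm0, so
// 256 minus that byte recovers the number of matching pairs (0..56)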
222
223 // int asmEq= numEq;
224 // numEq=0;
225 // uint8_t *temp= src;
226
227 #else
228 for(int y=0; y<BLOCK_SIZE-1; y++)
229 {
230 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
231 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
232 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
233 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
234 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
235 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
236 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
237 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
238 src+= stride;
239 }
240 #endif
241 /* if(abs(numEq - asmEq) > 0)
242 {
243 printf("\nasm:%d c:%d\n", asmEq, numEq);
244 for(int y=0; y<8; y++)
245 {
246 for(int x=0; x<8; x++)
247 {
248 printf("%d ", temp[x + y*stride]);
249 }
250 printf("\n");
251 }
252 }
253 */
254 return numEq > vFlatnessThreshold;
255 }
256
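/**
 * Check that |line1 - line8| <= 2*QP holds for every column of the 8x8 block,
 * i.e. there is no strong vertical edge that the low pass filter would smear.
 */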
257 static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP)
258 {
259 #ifdef HAVE_MMX
260 int isOk;
261 asm volatile(
262 // "int $3 \n\t"
263 "movq (%1, %2), %%mm0 \n\t"
264 "movq (%1, %2, 8), %%mm1 \n\t"
265 "movq %%mm0, %%mm2 \n\t"
266 "psubusb %%mm1, %%mm0 \n\t"
267 "psubusb %%mm2, %%mm1 \n\t"
268 "por %%mm1, %%mm0 \n\t" // ABS Diff
269
270 "movq pQPb, %%mm7 \n\t" // QP,..., QP
271 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
272 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
273 "pcmpeqd b00, %%mm0 \n\t"
274 "psrlq $16, %%mm0 \n\t"
275 "pcmpeqd bFF, %%mm0 \n\t"
276 // "movd %%mm0, (%1, %2, 4)\n\t"
277 "movd %%mm0, %0 \n\t"
278 : "=r" (isOk)
279 : "r" (src), "r" (stride)
280 );
281 return isOk;
282 #else
283
284 int isOk2= true;
285 for(int x=0; x<BLOCK_SIZE; x++)
286 {
287 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false;
288 }
289 /* if(isOk && !isOk2 || !isOk && isOk2)
290 {
291 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP);
292 for(int y=0; y<9; y++)
293 {
294 for(int x=0; x<8; x++)
295 {
296 printf("%d ", src[x + y*stride]);
297 }
298 printf("\n");
299 }
300 } */
301
302 return isOk2;
303 #endif
304
305 }
306
307 /**
308 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
309 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
310 */
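/* Edge handling sketch (derived from the C reference below): the line just
   outside the block is used as "first"/"last" only if it differs from the
   border line by less than QP, otherwise the border line is duplicated.
   The first output line then gets the weights (6,4,2,2,1,1)/16 on
   (first, l1, l2, l3, l4, l5), which is the "6 4 2 2 1 1" scheme noted in
   the MMX2 code. */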
311 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
312 {
313 // QP= 64;
314
315 #ifdef HAVE_MMX2
316 asm volatile( //"movv %0 %1 %2\n\t"
317 "pushl %0 \n\t"
318 "movq pQPb, %%mm0 \n\t" // QP,..., QP
319 // "movq bFF , %%mm0 \n\t" // QP,..., QP
320
321 "movq (%0), %%mm6 \n\t"
322 "movq (%0, %1), %%mm5 \n\t"
323 "movq %%mm5, %%mm1 \n\t"
324 "movq %%mm6, %%mm2 \n\t"
325 "psubusb %%mm6, %%mm5 \n\t"
326 "psubusb %%mm1, %%mm2 \n\t"
327 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
328 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
329 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
330
331 "pand %%mm2, %%mm6 \n\t"
332 "pandn %%mm1, %%mm2 \n\t"
333 "por %%mm2, %%mm6 \n\t"// First Line to Filter
334
335 "movq (%0, %1, 8), %%mm5 \n\t"
336 "leal (%0, %1, 4), %%eax \n\t"
337 "leal (%0, %1, 8), %%ebx \n\t"
338 "subl %1, %%ebx \n\t"
339 "addl %1, %0 \n\t" // %0 points to line 1 not 0
340 "movq (%0, %1, 8), %%mm7 \n\t"
341 "movq %%mm5, %%mm1 \n\t"
342 "movq %%mm7, %%mm2 \n\t"
343 "psubusb %%mm7, %%mm5 \n\t"
344 "psubusb %%mm1, %%mm2 \n\t"
345 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
346 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
347 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF
348
349 "pand %%mm2, %%mm7 \n\t"
350 "pandn %%mm1, %%mm2 \n\t"
351 "por %%mm2, %%mm7 \n\t" // First Line to Filter
352
353
354 // 1 2 3 4 5 6 7 8
355 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1
356 // 6 4 2 2 1 1
357 // 6 4 4 2
358 // 6 8 2
359 /*
360 "movq %%mm6, %%mm2 \n\t" //1
361 "movq %%mm6, %%mm3 \n\t" //1
362 "paddusb b02, %%mm3 \n\t"
363 "psrlw $2, %%mm3 \n\t" //1 /4
364 "pand b3F, %%mm3 \n\t"
365 "psubb %%mm3, %%mm2 \n\t"
366 "movq (%0, %1), %%mm0 \n\t" // 1
367 "movq %%mm0, %%mm1 \n\t" // 1
368 "paddusb b02, %%mm0 \n\t"
369 "psrlw $2, %%mm0 \n\t" // 1 /4
370 "pand b3F, %%mm0 \n\t"
371 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4
372 */
373 "movq (%0, %1), %%mm0 \n\t" // 1
374 "movq %%mm0, %%mm1 \n\t" // 1
375 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2
376 "pavgb %%mm6, %%mm0 \n\t" //3 1 /4
377
378 "movq (%0, %1, 4), %%mm2 \n\t" // 1
379 "movq %%mm2, %%mm5 \n\t" // 1
380 "pavgb (%%eax), %%mm2 \n\t" // 11 /2
381 "pavgb (%0, %1, 2), %%mm2 \n\t" // 211 /4
382 "movq %%mm2, %%mm3 \n\t" // 211 /4
383 "movq (%0), %%mm4 \n\t" // 1
384 "pavgb %%mm4, %%mm3 \n\t" // 4 211 /8
385 "pavgb %%mm0, %%mm3 \n\t" //642211 /16
386 "movq %%mm3, (%0) \n\t" // X
387 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
388 "movq %%mm1, %%mm0 \n\t" // 1
389 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2
390 "movq %%mm4, %%mm3 \n\t" // 1
391 "pavgb (%0,%1,2), %%mm3 \n\t" // 1 1 /2
392 "pavgb (%%eax,%1,2), %%mm5 \n\t" // 11 /2
393 "pavgb (%%eax), %%mm5 \n\t" // 211 /4
394 "pavgb %%mm5, %%mm3 \n\t" // 2 2211 /8
395 "pavgb %%mm0, %%mm3 \n\t" //4242211 /16
396 "movq %%mm3, (%0,%1) \n\t" // X
397 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
398 "pavgb %%mm4, %%mm6 \n\t" //11 /2
399 "movq (%%ebx), %%mm0 \n\t" // 1
400 "pavgb (%%eax, %1, 2), %%mm0 \n\t" // 11/2
401 "movq %%mm0, %%mm3 \n\t" // 11/2
402 "pavgb %%mm1, %%mm0 \n\t" // 2 11/4
403 "pavgb %%mm6, %%mm0 \n\t" //222 11/8
404 "pavgb %%mm2, %%mm0 \n\t" //22242211/16
405 "movq (%0, %1, 2), %%mm2 \n\t" // 1
406 "movq %%mm0, (%0, %1, 2) \n\t" // X
407 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
408 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
409 "pavgb (%%ebx), %%mm0 \n\t" // 11 /2
410 "pavgb %%mm0, %%mm6 \n\t" //11 11 /4
411 "pavgb %%mm1, %%mm4 \n\t" // 11 /2
412 "pavgb %%mm2, %%mm1 \n\t" // 11 /2
413 "pavgb %%mm1, %%mm6 \n\t" //1122 11 /8
414 "pavgb %%mm5, %%mm6 \n\t" //112242211 /16
415 "movq (%%eax), %%mm5 \n\t" // 1
416 "movq %%mm6, (%%eax) \n\t" // X
417 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
418 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
419 "pavgb %%mm7, %%mm6 \n\t" // 11 /2
420 "pavgb %%mm4, %%mm6 \n\t" // 11 11 /4
421 "pavgb %%mm3, %%mm6 \n\t" // 11 2211 /8
422 "pavgb %%mm5, %%mm2 \n\t" // 11 /2
423 "movq (%0, %1, 4), %%mm4 \n\t" // 1
424 "pavgb %%mm4, %%mm2 \n\t" // 112 /4
425 "pavgb %%mm2, %%mm6 \n\t" // 112242211 /16
426 "movq %%mm6, (%0, %1, 4) \n\t" // X
427 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
428 "pavgb %%mm7, %%mm1 \n\t" // 11 2 /4
429 "pavgb %%mm4, %%mm5 \n\t" // 11 /2
430 "pavgb %%mm5, %%mm0 \n\t" // 11 11 /4
431 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
432 "pavgb %%mm6, %%mm1 \n\t" // 11 4 2 /8
433 "pavgb %%mm0, %%mm1 \n\t" // 11224222 /16
434 // "pxor %%mm1, %%mm1 \n\t"
435 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
436 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
437 "pavgb (%%ebx), %%mm2 \n\t" // 112 4 /8
438 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
439 "pavgb %%mm0, %%mm6 \n\t" // 1 1 /2
440 "pavgb %%mm7, %%mm6 \n\t" // 1 12 /4
441 "pavgb %%mm2, %%mm6 \n\t" // 1122424 /4
442 // "pxor %%mm6, %%mm6 \n\t"
443 "movq %%mm6, (%%ebx) \n\t" // X
444 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
445 "pavgb %%mm7, %%mm5 \n\t" // 11 2 /4
446 "pavgb %%mm7, %%mm5 \n\t" // 11 6 /8
447
448 "pavgb %%mm3, %%mm0 \n\t" // 112 /4
449 "pavgb %%mm0, %%mm5 \n\t" // 112246 /16
450 // "pxor %%mm5, %%mm5 \n\t"
451 // "movq pQPb, %%mm5 \n\t"
452 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
453 "popl %0\n\t"
454
455 :
456 : "r" (src), "r" (stride)
457 : "%eax", "%ebx"
458 );
459
460 #else
461 const int l1= stride;
462 const int l2= stride + l1;
463 const int l3= stride + l2;
464 const int l4= stride + l3;
465 const int l5= stride + l4;
466 const int l6= stride + l5;
467 const int l7= stride + l6;
468 const int l8= stride + l7;
469 const int l9= stride + l8;
470
471 for(int x=0; x<BLOCK_SIZE; x++)
472 {
473 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
474 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
475
476 int sums[9];
477 sums[0] = first + src[l1];
478 sums[1] = src[l1] + src[l2];
479 sums[2] = src[l2] + src[l3];
480 sums[3] = src[l3] + src[l4];
481 sums[4] = src[l4] + src[l5];
482 sums[5] = src[l5] + src[l6];
483 sums[6] = src[l6] + src[l7];
484 sums[7] = src[l7] + src[l8];
485 sums[8] = src[l8] + last;
486
487 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
488 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
489 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
490 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
491 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
492 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
493 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
494 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
495
496 src++;
497 }
498
499 #endif
500 }
501
502 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
503 {
504 #ifdef HAVE_MMX
505 src+= stride;
506 //FIXME try pmul for *5 stuff
507 // src[0]=0;
508 asm volatile(
509 "pxor %%mm7, %%mm7 \n\t"
510 "leal (%0, %1), %%eax \n\t"
511 "leal (%%eax, %1, 4), %%ebx \n\t"
512 // 0 1 2 3 4 5 6 7
513 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1
514 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1
515
516 "movq (%0), %%mm0 \n\t"
517 "movq %%mm0, %%mm1 \n\t"
518 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
519 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
520
521 "movq (%%eax), %%mm2 \n\t"
522 "movq %%mm2, %%mm3 \n\t"
523 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
524 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
525
526 "movq (%%eax, %1), %%mm4 \n\t"
527 "movq %%mm4, %%mm5 \n\t"
528 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
529 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
530
531 "paddw %%mm0, %%mm0 \n\t" // 2L0
532 "paddw %%mm1, %%mm1 \n\t" // 2H0
533 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
534 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
535 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
536 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
537
538 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
539 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
540 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
541 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
542
543 "movq (%%eax, %1, 2), %%mm2 \n\t"
544 "movq %%mm2, %%mm3 \n\t"
545 "punpcklbw %%mm7, %%mm2 \n\t" // L3
546 "punpckhbw %%mm7, %%mm3 \n\t" // H3
547
548 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
549 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
550 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
551 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
552 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
553 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
554
555 "movq (%0, %1, 4), %%mm0 \n\t"
556 "movq %%mm0, %%mm1 \n\t"
557 "punpcklbw %%mm7, %%mm0 \n\t" // L4
558 "punpckhbw %%mm7, %%mm1 \n\t" // H4
559
560 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
561 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
562 "movq %%mm2, temp2 \n\t" // L3 - L4
563 "movq %%mm3, temp3 \n\t" // H3 - H4
564 "paddw %%mm4, %%mm4 \n\t" // 2L2
565 "paddw %%mm5, %%mm5 \n\t" // 2H2
566 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
567 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
568
569 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
570 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
571 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
572 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
573 //50 opcodes so far
574 "movq (%%ebx), %%mm2 \n\t"
575 "movq %%mm2, %%mm3 \n\t"
576 "punpcklbw %%mm7, %%mm2 \n\t" // L5
577 "punpckhbw %%mm7, %%mm3 \n\t" // H5
578 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
579 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
580 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
581 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
582
583 "movq (%%ebx, %1), %%mm6 \n\t"
584 "punpcklbw %%mm7, %%mm6 \n\t" // L6
585 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
586 "movq (%%ebx, %1), %%mm6 \n\t"
587 "punpckhbw %%mm7, %%mm6 \n\t" // H6
588 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
589
590 "paddw %%mm0, %%mm0 \n\t" // 2L4
591 "paddw %%mm1, %%mm1 \n\t" // 2H4
592 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
593 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
594
595 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
596 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
597 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
598 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
599
600 "movq (%%ebx, %1, 2), %%mm2 \n\t"
601 "movq %%mm2, %%mm3 \n\t"
602 "punpcklbw %%mm7, %%mm2 \n\t" // L7
603 "punpckhbw %%mm7, %%mm3 \n\t" // H7
604
605 "paddw %%mm2, %%mm2 \n\t" // 2L7
606 "paddw %%mm3, %%mm3 \n\t" // 2H7
607 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
608 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
609
610 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
611 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
612 //FIXME pxor, psubw, pmax for abs
613 "movq %%mm7, %%mm6 \n\t" // 0
614 "pcmpgtw %%mm0, %%mm6 \n\t"
615 "pxor %%mm6, %%mm0 \n\t"
616 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
617 "movq %%mm7, %%mm6 \n\t" // 0
618 "pcmpgtw %%mm1, %%mm6 \n\t"
619 "pxor %%mm6, %%mm1 \n\t"
620 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
621
622 "movq %%mm7, %%mm6 \n\t" // 0
623 "pcmpgtw %%mm2, %%mm6 \n\t"
624 "pxor %%mm6, %%mm2 \n\t"
625 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
626 "movq %%mm7, %%mm6 \n\t" // 0
627 "pcmpgtw %%mm3, %%mm6 \n\t"
628 "pxor %%mm6, %%mm3 \n\t"
629 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
630
631 #ifdef HAVE_MMX2
632 "pminsw %%mm2, %%mm0 \n\t"
633 "pminsw %%mm3, %%mm1 \n\t"
634 #else
635 "movq %%mm0, %%mm6 \n\t"
636 "psubusw %%mm2, %%mm6 \n\t"
637 "psubw %%mm6, %%mm0 \n\t"
638 "movq %%mm1, %%mm6 \n\t"
639 "psubusw %%mm3, %%mm6 \n\t"
640 "psubw %%mm6, %%mm1 \n\t"
641 #endif
642
643 "movq %%mm7, %%mm6 \n\t" // 0
644 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
645 "pxor %%mm6, %%mm4 \n\t"
646 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
647 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
648 "pxor %%mm7, %%mm5 \n\t"
649 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
650 // 100 opcodes
651 "movd %2, %%mm2 \n\t" // QP
652 //"pcmpeqb %%mm2, %%mm2\n\t"
653 "punpcklwd %%mm2, %%mm2 \n\t"
654 "punpcklwd %%mm2, %%mm2 \n\t"
655 "psllw $3, %%mm2 \n\t" // 8QP
656 "movq %%mm2, %%mm3 \n\t" // 8QP
657 "pcmpgtw %%mm4, %%mm2 \n\t"
658 "pcmpgtw %%mm5, %%mm3 \n\t"
659 "pand %%mm2, %%mm4 \n\t"
660 "pand %%mm3, %%mm5 \n\t"
661
662
663 "psubusw %%mm0, %%mm4 \n\t" // hd
664 "psubusw %%mm1, %%mm5 \n\t" // ld
665
666
667 "movq w05, %%mm2 \n\t" // 5
668 "pmullw %%mm2, %%mm4 \n\t"
669 "pmullw %%mm2, %%mm5 \n\t"
670 "movq w20, %%mm2 \n\t" // 32
671 "paddw %%mm2, %%mm4 \n\t"
672 "paddw %%mm2, %%mm5 \n\t"
673 "psrlw $6, %%mm4 \n\t"
674 "psrlw $6, %%mm5 \n\t"
675
676 /*
677 "movq w06, %%mm2 \n\t" // 6
678 "paddw %%mm2, %%mm4 \n\t"
679 "paddw %%mm2, %%mm5 \n\t"
680 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16
681 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
682 "pmulhw %%mm2, %%mm4 \n\t" // hd/13
683 "pmulhw %%mm2, %%mm5 \n\t" // ld/13
684 */
685
686 "movq temp2, %%mm0 \n\t" // L3 - L4
687 "movq temp3, %%mm1 \n\t" // H3 - H4
688
689 "pxor %%mm2, %%mm2 \n\t"
690 "pxor %%mm3, %%mm3 \n\t"
691
692 // FIXME rounding error
693 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2
694 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2
695 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
696 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
697 "pxor %%mm2, %%mm0 \n\t"
698 "pxor %%mm3, %%mm1 \n\t"
699 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
700 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
701 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
702 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
703
704 "pxor %%mm6, %%mm2 \n\t"
705 "pxor %%mm7, %%mm3 \n\t"
706 "pand %%mm2, %%mm4 \n\t"
707 "pand %%mm3, %%mm5 \n\t"
708
709 #ifdef HAVE_MMX2
710 "pminsw %%mm0, %%mm4 \n\t"
711 "pminsw %%mm1, %%mm5 \n\t"
712 #else
713 "movq %%mm4, %%mm2 \n\t"
714 "psubusw %%mm0, %%mm2 \n\t"
715 "psubw %%mm2, %%mm4 \n\t"
716 "movq %%mm5, %%mm2 \n\t"
717 "psubusw %%mm1, %%mm2 \n\t"
718 "psubw %%mm2, %%mm5 \n\t"
719 #endif
720 "pxor %%mm6, %%mm4 \n\t"
721 "pxor %%mm7, %%mm5 \n\t"
722 "psubw %%mm6, %%mm4 \n\t"
723 "psubw %%mm7, %%mm5 \n\t"
724 "packsswb %%mm5, %%mm4 \n\t"
725 "movq (%%eax, %1, 2), %%mm0 \n\t"
726 "paddb %%mm4, %%mm0 \n\t"
727 "movq %%mm0, (%%eax, %1, 2) \n\t"
728 "movq (%0, %1, 4), %%mm0 \n\t"
729 "psubb %%mm4, %%mm0 \n\t"
730 // "pxor %%mm0, %%mm0 \n\t"
731 "movq %%mm0, (%0, %1, 4) \n\t"
732
733 :
734 : "r" (src), "r" (stride), "r" (QP)
735 : "%eax", "%ebx"
736 );
737 #else
738 const int l1= stride;
739 const int l2= stride + l1;
740 const int l3= stride + l2;
741 const int l4= stride + l3;
742 const int l5= stride + l4;
743 const int l6= stride + l5;
744 const int l7= stride + l6;
745 const int l8= stride + l7;
746 // const int l9= stride + l8;
747
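/* C reference of the default (non-flat) mode: the correction d is derived from
   the energy across the border (the 2,-5,5,-2 tap pattern around l4/l5), is
   applied only while that energy stays below 8*QP, is reduced by the smaller
   of the two neighbouring energies so textured areas are left alone, and is
   clipped so it never exceeds half the original step between l4 and l5 nor
   flips its sign. */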
748 for(int x=0; x<BLOCK_SIZE; x++)
749 {
750 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
751 if(ABS(middleEnergy) < 8*QP)
752 {
753 const int q=(src[l4] - src[l5])/2;
754 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
755 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
756
757 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
758 d= MAX(d, 0);
759
760 d= (5*d + 32) >> 6;
761 d*= SIGN(-middleEnergy);
762
763 if(q>0)
764 {
765 d= d<0 ? 0 : d;
766 d= d>q ? q : d;
767 }
768 else
769 {
770 d= d>0 ? 0 : d;
771 d= d<q ? q : d;
772 }
773
774 src[l4]-= d;
775 src[l5]+= d;
776 }
777 src++;
778 }
779 #endif
780 }
781
782 //FIXME? |255-0| = 1
783 /**
784 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
785 */
786 static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride)
787 {
788 // src++;
789 int numEq= 0;
790 #ifdef HAVE_MMX
791 asm volatile (
792 // "int $3 \n\t"
793 "pushl %1\n\t"
794 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F
795 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D
796 "leal tempBlock, %%eax \n\t"
797 "pxor %%mm0, %%mm0 \n\t"
798
799 #define HDC_CHECK_AND_CPY(i) \
800 "movq -4(%1), %%mm2 \n\t"\
801 "psrlq $32, %%mm2 \n\t"\
802 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\
803 "movq %%mm2, %%mm1 \n\t"\
804 "psrlq $8, %%mm2 \n\t"\
805 "psubb %%mm1, %%mm2 \n\t"\
806 "paddb %%mm7, %%mm2 \n\t"\
807 "pcmpgtb %%mm6, %%mm2 \n\t"\
808 "paddb %%mm2, %%mm0 \n\t"\
809 "movq %%mm1," #i "(%%eax) \n\t"
810
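/* each processed row straddles the vertical block border (the callers pass
   dstBlock-4): the row is assembled from the loads at offsets -4 and +4, the
   adjacent-pixel differences are flagged exactly as in isVertDC, and the raw
   row is stored into tempBlock for the horizontal filters that follow */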
811 HDC_CHECK_AND_CPY(0)
812 "addl %2, %1 \n\t"
813 HDC_CHECK_AND_CPY(8)
814 "addl %2, %1 \n\t"
815 HDC_CHECK_AND_CPY(16)
816 "addl %2, %1 \n\t"
817 HDC_CHECK_AND_CPY(24)
818 "addl %2, %1 \n\t"
819 HDC_CHECK_AND_CPY(32)
820 "addl %2, %1 \n\t"
821 HDC_CHECK_AND_CPY(40)
822 "addl %2, %1 \n\t"
823 HDC_CHECK_AND_CPY(48)
824 "addl %2, %1 \n\t"
825 HDC_CHECK_AND_CPY(56)
826
827 "psllq $8, %%mm0 \n\t" // remove dummy value
828 "movq %%mm0, %%mm1 \n\t"
829 "psrlw $8, %%mm0 \n\t"
830 "paddb %%mm1, %%mm0 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "psrlq $16, %%mm0 \n\t"
833 "paddb %%mm1, %%mm0 \n\t"
834 "movq %%mm0, %%mm1 \n\t"
835 "psrlq $32, %%mm0 \n\t"
836 "paddb %%mm1, %%mm0 \n\t"
837 "popl %1\n\t"
838 "movd %%mm0, %0 \n\t"
839 : "=r" (numEq)
840 : "r" (src), "r" (stride)
841 : "%eax"
842 );
843 // printf("%d\n", numEq);
844 numEq= (256 - (numEq & 0xFF)) &0xFF;
845 #else
846 for(int y=0; y<BLOCK_SIZE; y++)
847 {
848 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
849 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
850 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
851 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
852 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
853 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
854 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
855 tempBlock[0 + y*TEMP_STRIDE] = src[0];
856 tempBlock[1 + y*TEMP_STRIDE] = src[1];
857 tempBlock[2 + y*TEMP_STRIDE] = src[2];
858 tempBlock[3 + y*TEMP_STRIDE] = src[3];
859 tempBlock[4 + y*TEMP_STRIDE] = src[4];
860 tempBlock[5 + y*TEMP_STRIDE] = src[5];
861 tempBlock[6 + y*TEMP_STRIDE] = src[6];
862 tempBlock[7 + y*TEMP_STRIDE] = src[7];
863 src+= stride;
864 }
865 #endif
866 /* if(abs(numEq - asmEq) > 0)
867 {
868 // printf("\nasm:%d c:%d\n", asmEq, numEq);
869 for(int y=0; y<8; y++)
870 {
871 for(int x=0; x<8; x++)
872 {
873 printf("%d ", src[x + y*stride]);
874 }
875 printf("\n");
876 }
877 }
878 */
879 // printf("%d\n", numEq);
880 return numEq > hFlatnessThreshold;
881 }
882
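/**
 * Horizontal counterpart of isVertMinMaxOk: reject the low pass filter when
 * the row spans more contrast than 2*QP. The C version only compares the
 * outermost pixels of the row; the MMX path is unfinished (MMX_FIXME).
 */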
883 static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP)
884 {
885 #ifdef MMX_FIXME
886 FIXME
887 int isOk;
888 asm volatile(
889 // "int $3 \n\t"
890 "movq (%1, %2), %%mm0 \n\t"
891 "movq (%1, %2, 8), %%mm1 \n\t"
892 "movq %%mm0, %%mm2 \n\t"
893 "psubusb %%mm1, %%mm0 \n\t"
894 "psubusb %%mm2, %%mm1 \n\t"
895 "por %%mm1, %%mm0 \n\t" // ABS Diff
896
897 "movq pQPb, %%mm7 \n\t" // QP,..., QP
898 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
899 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
900 "pcmpeqd b00, %%mm0 \n\t"
901 "psrlq $16, %%mm0 \n\t"
902 "pcmpeqd bFF, %%mm0 \n\t"
903 // "movd %%mm0, (%1, %2, 4)\n\t"
904 "movd %%mm0, %0 \n\t"
905 : "=r" (isOk)
906 : "r" (src), "r" (stride)
907 );
908 return isOk;
909 #else
910 if(abs(src[0] - src[7]) > 2*QP) return false;
911
912 return true;
913 #endif
914 }
915
916 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
917 {
918 #ifdef HAVE_MMX2
919 asm volatile(
920 "pushl %0 \n\t"
921 "pxor %%mm7, %%mm7 \n\t"
922 "movq bm00001000, %%mm6 \n\t"
923 "movd %2, %%mm5 \n\t" // QP
924 "movq %%mm5, %%mm4 \n\t"
925 "paddusb %%mm5, %%mm5 \n\t" // 2QP
926 "paddusb %%mm5, %%mm4 \n\t" // 3QP
927 "psllq $24, %%mm4 \n\t"
928 "pxor %%mm5, %%mm5 \n\t" // 0
929 "psubb %%mm4, %%mm5 \n\t" // -QP
930 "leal tempBlock, %%eax \n\t"
931
932 //FIXME? "unroll by 2" and mix
933 #define HDF(i) "movq " #i "(%%eax), %%mm0 \n\t"\
934 "movq %%mm0, %%mm1 \n\t"\
935 "movq %%mm0, %%mm2 \n\t"\
936 "psrlq $8, %%mm1 \n\t"\
937 "psubusb %%mm1, %%mm2 \n\t"\
938 "psubusb %%mm0, %%mm1 \n\t"\
939 "por %%mm2, %%mm1 \n\t" /* |px - p(x+1)| */\
940 "pcmpeqb %%mm7, %%mm2 \n\t" /* sgn[px - p(x+1)] */\
941 "pshufw $0xAA, %%mm1, %%mm3 \n\t"\
942 "pminub %%mm1, %%mm3 \n\t"\
943 "psrlq $16, %%mm3 \n\t"\
944 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
945 "paddb %%mm5, %%mm1 \n\t"\
946 "psubusb %%mm5, %%mm1 \n\t"\
947 "psrlw $2, %%mm1 \n\t"\
948 "pxor %%mm2, %%mm1 \n\t"\
949 "psubb %%mm2, %%mm1 \n\t"\
950 "pand %%mm6, %%mm1 \n\t"\
951 "psubb %%mm1, %%mm0 \n\t"\
952 "psllq $8, %%mm1 \n\t"\
953 "paddb %%mm1, %%mm0 \n\t"\
954 "movd %%mm0, (%0) \n\t"\
955 "psrlq $32, %%mm0 \n\t"\
956 "movd %%mm0, 4(%0) \n\t"
957
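/* Each HDF(i) handles one row stored in tempBlock: it computes the
   neighbouring pixel differences, limits the correction for the centre pair
   (bytes 3/4, selected via bm00001000) by the smaller neighbouring difference
   and by the QP-derived bound in mm5, and writes the 8 corrected pixels back
   to dst. This is the approximate MMX2 variant ("a" in the table at the top
   of the file); the exact C reference is in the #else branch below. */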
958 HDF(0)
959 "addl %1, %0 \n\t"
960 HDF(8)
961 "addl %1, %0 \n\t"
962 HDF(16)
963 "addl %1, %0 \n\t"
964 HDF(24)
965 "addl %1, %0 \n\t"
966 HDF(32)
967 "addl %1, %0 \n\t"
968 HDF(40)
969 "addl %1, %0 \n\t"
970 HDF(48)
971 "addl %1, %0 \n\t"
972 HDF(56)
973 "popl %0 \n\t"
974 :
975 : "r" (dst), "r" (stride), "r" (QP)
976 : "%eax"
977 );
978 #else
979 uint8_t *src= tempBlock;
980
981 for(int y=0; y<BLOCK_SIZE; y++)
982 {
983 dst[0] = src[0];
984 dst[1] = src[1];
985 dst[2] = src[2];
986 dst[3] = src[3];
987 dst[4] = src[4];
988 dst[5] = src[5];
989 dst[6] = src[6];
990 dst[7] = src[7];
991
992 const int middleEnergy= 5*(src[4] - src[3]) + 2*(src[2] - src[5]);
993 if(ABS(middleEnergy) < 8*QP)
994 {
995 const int q=(src[3] - src[4])/2;
996 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]);
997 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
998
999 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1000 d= MAX(d, 0);
1001
1002 d= (5*d + 32) >> 6;
1003 d*= SIGN(-middleEnergy);
1004
1005 if(q>0)
1006 {
1007 d= d<0 ? 0 : d;
1008 d= d>q ? q : d;
1009 }
1010 else
1011 {
1012 d= d>0 ? 0 : d;
1013 d= d<q ? q : d;
1014 }
1015
1016 dst[3]-= d;
1017 dst[4]+= d;
1018 }
1019 dst+= stride;
1020 src+= TEMP_STRIDE;
1021 }
1022 #endif
1023 }
1024
1025 /**
1026 * Do a horizontal low pass filter on the 8x8 block
1027 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1028 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2 version)
1029 */
1030 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1031 {
1032 //return;
1033 #ifdef HAVE_MMX2
1034 asm volatile( //"movv %0 %1 %2\n\t"
1035 "pushl %0\n\t"
1036 "pxor %%mm7, %%mm7 \n\t"
1037 "leal tempBlock, %%eax \n\t"
1038
1039 #define HLP1 "movq (%0), %%mm0 \n\t"\
1040 "movq %%mm0, %%mm1 \n\t"\
1041 "psllq $8, %%mm0 \n\t"\
1042 "pavgb %%mm1, %%mm0 \n\t"\
1043 "psrlw $8, %%mm0 \n\t"\
1044 "pxor %%mm1, %%mm1 \n\t"\
1045 "packuswb %%mm1, %%mm0 \n\t"\
1046 "movq %%mm0, %%mm1 \n\t"\
1047 "movq %%mm0, %%mm2 \n\t"\
1048 "psllq $32, %%mm0 \n\t"\
1049 "paddb %%mm0, %%mm1 \n\t"\
1050 "psllq $16, %%mm2 \n\t"\
1051 "pavgb %%mm2, %%mm0 \n\t"\
1052 "movq %%mm0, %%mm3 \n\t"\
1053 "pand bm11001100, %%mm0 \n\t"\
1054 "paddusb %%mm0, %%mm3 \n\t"\
1055 "psrlq $8, %%mm3 \n\t"\
1056 "pavgb %%mm1, %%mm4 \n\t"\
1057 "pavgb %%mm3, %%mm2 \n\t"\
1058 "psrlq $16, %%mm2 \n\t"\
1059 "punpcklbw %%mm2, %%mm2 \n\t"\
1060 "movq %%mm2, (%0) \n\t"\
1061
1062 #define HLP2 "movq (%0), %%mm0 \n\t"\
1063 "movq %%mm0, %%mm1 \n\t"\
1064 "psllq $8, %%mm0 \n\t"\
1065 "pavgb %%mm1, %%mm0 \n\t"\
1066 "psrlw $8, %%mm0 \n\t"\
1067 "pxor %%mm1, %%mm1 \n\t"\
1068 "packuswb %%mm1, %%mm0 \n\t"\
1069 "movq %%mm0, %%mm2 \n\t"\
1070 "psllq $32, %%mm0 \n\t"\
1071 "psllq $16, %%mm2 \n\t"\
1072 "pavgb %%mm2, %%mm0 \n\t"\
1073 "movq %%mm0, %%mm3 \n\t"\
1074 "pand bm11001100, %%mm0 \n\t"\
1075 "paddusb %%mm0, %%mm3 \n\t"\
1076 "psrlq $8, %%mm3 \n\t"\
1077 "pavgb %%mm3, %%mm2 \n\t"\
1078 "psrlq $16, %%mm2 \n\t"\
1079 "punpcklbw %%mm2, %%mm2 \n\t"\
1080 "movq %%mm2, (%0) \n\t"\
1081
1082 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1083 /*
1084 31
1085 121
1086 121
1087 121
1088 121
1089 121
1090 121
1091 13
1092 Implemented Exact 7-Tap
1093 9421 A321
1094 36421 64321
1095 334321 =
1096 1234321 =
1097 1234321 =
1098 123433 =
1099 12463 12346
1100 1249 123A
1101
1102 */
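/* one possible reading of the table above (an assumption, not stated by the
   author): the left column lists the coefficients the pavgb cascade actually
   produces for each of the 8 output pixels (A = 10), the right column the
   exact 7-tap (1,2,3,4,3,2,1) pattern, and "=" marks rows where both agree;
   every row sums to 16 */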
1103 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\
1104 "movq %%mm0, %%mm1 \n\t"\
1105 "movq %%mm0, %%mm2 \n\t"\
1106 "movq %%mm0, %%mm3 \n\t"\
1107 "movq %%mm0, %%mm4 \n\t"\
1108 "psllq $8, %%mm1 \n\t"\
1109 "psrlq $8, %%mm2 \n\t"\
1110 "pand bm00000001, %%mm3 \n\t"\
1111 "pand bm10000000, %%mm4 \n\t"\
1112 "por %%mm3, %%mm1 \n\t"\
1113 "por %%mm4, %%mm2 \n\t"\
1114 "pavgb %%mm2, %%mm1 \n\t"\
1115 "pavgb %%mm1, %%mm0 \n\t"\
1116 \
1117 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\
1118 "pshufw $0x90, %%mm0, %%mm4 \n\t"\
1119 "pavgb %%mm3, %%mm4 \n\t"\
1120 "pavgb %%mm4, %%mm0 \n\t"\
1121 "movd %%mm0, (%0) \n\t"\
1122 "psrlq $32, %%mm0 \n\t"\
1123 "movd %%mm0, 4(%0) \n\t"\
1124
1125 #define HLP(i) HLP3(i)
1126
1127 HLP(0)
1128 "addl %1, %0 \n\t"
1129 HLP(8)
1130 "addl %1, %0 \n\t"
1131 HLP(16)
1132 "addl %1, %0 \n\t"
1133 HLP(24)
1134 "addl %1, %0 \n\t"
1135 HLP(32)
1136 "addl %1, %0 \n\t"
1137 HLP(40)
1138 "addl %1, %0 \n\t"
1139 HLP(48)
1140 "addl %1, %0 \n\t"
1141 HLP(56)
1142
1143 "popl %0\n\t"
1144 :
1145 : "r" (dst), "r" (stride)
1146 : "%eax", "%ebx"
1147 );
1148
1149 #else
1150 uint8_t *temp= tempBlock;
1151 for(int y=0; y<BLOCK_SIZE; y++)
1152 {
1153 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1154 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1155
1156 int sums[9];
1157 sums[0] = first + temp[0];
1158 sums[1] = temp[0] + temp[1];
1159 sums[2] = temp[1] + temp[2];
1160 sums[3] = temp[2] + temp[3];
1161 sums[4] = temp[3] + temp[4];
1162 sums[5] = temp[4] + temp[5];
1163 sums[6] = temp[5] + temp[6];
1164 sums[7] = temp[6] + temp[7];
1165 sums[8] = temp[7] + last;
1166
1167 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1168 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
1169 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
1170 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
1171 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
1172 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
1173 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
1174 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
1175
1176 dst+= stride;
1177 temp+= TEMP_STRIDE;
1178 }
1179 #endif
1180 }
1181
1182
1183 static inline void dering(uint8_t src[], int stride, int QP)
1184 {
1185 //FIXME
1186
1187 #ifdef HAVE_MMX2X
1188 asm volatile(
1189 "leal (%0, %1), %%eax \n\t"
1190 "leal (%%eax, %1, 4), %%ebx \n\t"
1191 // 0 1 2 3 4 5 6 7 8 9
1192 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1193
1194 "pcmpeq %%mm6, %%mm6 \n\t"
1195 "pxor %%mm7, %%mm7 \n\t"
1196
1197 #define FIND_MIN_MAX(addr)\
1198 "movq (" #addr "), %%mm0, \n\t"\
1199 "pminub %%mm0, %%mm6 \n\t"\
1200 "pmaxub %%mm0, %%mm7 \n\t"
1201
1202 FIND_MIN_MAX(%0)
1203 FIND_MIN_MAX(%%eax)
1204 FIND_MIN_MAX(%%eax, %1)
1205 FIND_MIN_MAX(%%eax, %1, 2)
1206 FIND_MIN_MAX(%0, %1, 4)
1207 FIND_MIN_MAX(%%ebx)
1208 FIND_MIN_MAX(%%ebx, %1)
1209 FIND_MIN_MAX(%%ebx, %1, 2)
1210 FIND_MIN_MAX(%0, %1, 8)
1211 FIND_MIN_MAX(%%ebx, %1, 4)
1212
1213 "movq %%mm6, %%mm4 \n\t"
1214 "psrlq $32, %%mm6 \n\t"
1215 "pminub %%mm4, %%mm6 \n\t"
1216 "movq %%mm6, %%mm4 \n\t"
1217 "psrlq $16, %%mm6 \n\t"
1218 "pminub %%mm4, %%mm6 \n\t"
1219 "movq %%mm6, %%mm4 \n\t"
1220 "psrlq $8, %%mm6 \n\t"
1221 "pminub %%mm4, %%mm6 \n\t" // min of pixels
1222
1223 "movq %%mm7, %%mm4 \n\t"
1224 "psrlq $32, %%mm7 \n\t"
1225 "pmaxub %%mm4, %%mm7 \n\t"
1226 "movq %%mm7, %%mm4 \n\t"
1227 "psrlq $16, %%mm7 \n\t"
1228 "pmaxub %%mm4, %%mm7 \n\t"
1229 "movq %%mm7, %%mm4 \n\t"
1230 "psrlq $8, %%mm7 \n\t"
1231 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels
1232 "pavgb %%mm6, %%mm7 \n\t" // (max + min)/2
1233
1234
1235 : : "r" (src), "r" (stride), "r" (QP)
1236 : "%eax", "%ebx"
1237 );
1238 #else
1239
1240 //FIXME
1241 #endif
1242 }
1243
1244 /**
1245 * ...
1246 */
1247 extern "C"{
1248 void postprocess(unsigned char * src[], int src_stride,
1249 unsigned char * dst[], int dst_stride,
1250 int horizontal_size, int vertical_size,
1251 QP_STORE_T *QP_store, int QP_stride,
1252 int mode)
1253 {
1254 /*
1255 long long T= rdtsc();
1256 for(int y=vertical_size-1; y>=0 ; y--)
1257 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
1258 // memcpy(dst[0], src[0],src_stride*vertical_size);
1259 printf("%4dk\r", (rdtsc()-T)/1000);
1260
1261 return;
1262 */
1263 /*
1264 long long T= rdtsc();
1265 while( (rdtsc() - T)/1000 < 4000);
1266
1267 return;
1268 */
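/* planar YUV 4:2:0 is apparently assumed: src[0]/dst[0] are the full
   resolution Y planes and src[1..2]/dst[1..2] the half resolution chroma
   planes, hence the >>1 on sizes and strides below; the same QP array is
   reused for chroma, where postProcess() indexes it per 8x8 chroma block,
   i.e. per macroblock */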
1269 postProcess(src[0], src_stride,
1270 dst[0], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, false);
1271
1272 horizontal_size >>= 1;
1273 vertical_size >>= 1;
1274 src_stride >>= 1;
1275 dst_stride >>= 1;
1276
1277 if(1)
1278 {
1279 postProcess(src[1], src_stride,
1280 dst[1], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true);
1281 postProcess(src[2], src_stride,
1282 dst[2], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true);
1283 }
1284 else
1285 {
1286 memcpy(dst[1], src[1], src_stride*vertical_size);
1287 memcpy(dst[2], src[2], src_stride*vertical_size);
1288 }
1289 }
1290 }
1291
1292 /**
1293 * Copies a block from src to dst and fixes the black level
1294 */
1295 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride)
1296 {
1297 #ifdef HAVE_MMX
1298 asm volatile(
1299 "pushl %0 \n\t"
1300 "pushl %1 \n\t"
1301 "leal (%2,%2), %%eax \n\t"
1302 "leal (%3,%3), %%ebx \n\t"
1303 "movq packedYOffset, %%mm2 \n\t"
1304 "movq packedYScale, %%mm3 \n\t"
1305
1306 #define SIMPLE_CPY \
1307 "movq (%0), %%mm0 \n\t"\
1308 "movq (%0,%2), %%mm1 \n\t"\
1309 "psubusb %%mm2, %%mm0 \n\t"\
1310 "psubusb %%mm2, %%mm1 \n\t"\
1311 "movq %%mm0, (%1) \n\t"\
1312 "movq %%mm1, (%1, %3) \n\t"\
1313
1314 #define SCALED_CPY \
1315 "movq (%0), %%mm0 \n\t"\
1316 "movq (%0,%2), %%mm1 \n\t"\
1317 "psubusb %%mm2, %%mm0 \n\t"\
1318 "psubusb %%mm2, %%mm1 \n\t"\
1319 "pxor %%mm4, %%mm4 \n\t"\
1320 "pxor %%mm5, %%mm5 \n\t"\
1321 "punpcklbw %%mm0, %%mm4 \n\t"\
1322 "punpckhbw %%mm0, %%mm5 \n\t"\
1323 "pmulhuw %%mm3, %%mm4 \n\t"\
1324 "pmulhuw %%mm3, %%mm5 \n\t"\
1325 "packuswb %%mm5, %%mm4 \n\t"\
1326 "movq %%mm4, (%1) \n\t"\
1327 "pxor %%mm4, %%mm4 \n\t"\
1328 "pxor %%mm5, %%mm5 \n\t"\
1329 "punpcklbw %%mm1, %%mm4 \n\t"\
1330 "punpckhbw %%mm1, %%mm5 \n\t"\
1331 "pmulhuw %%mm3, %%mm4 \n\t"\
1332 "pmulhuw %%mm3, %%mm5 \n\t"\
1333 "packuswb %%mm5, %%mm4 \n\t"\
1334 "movq %%mm4, (%1, %3) \n\t"\
1335
1336
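// SCALED_CPY computes out = saturate((in - packedYOffset) * packedYScale / 256)
// per pixel: punpcklbw against zero places each byte in the high half of a
// word (value << 8), so pmulhuw by the 8.8 fixed point scale directly yields
// the corrected 8 bit result; packedYScale defaults to 0x0100 (i.e. 1.0)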
1337 #define CPY SCALED_CPY
1338 //#define CPY SIMPLE_CPY
1339 // "prefetchnta 8(%0)\n\t"
1340 CPY
1341 "addl %%eax, %0 \n\t"
1342 "addl %%ebx, %1 \n\t"
1343 CPY
1344 "addl %%eax, %0 \n\t"
1345 "addl %%ebx, %1 \n\t"
1346 CPY
1347 "addl %%eax, %0 \n\t"
1348 "addl %%ebx, %1 \n\t"
1349 CPY
1350 "popl %1 \n\t"
1351 "popl %0 \n\t"
1352 : : "r" (src),
1353 "r" (dst),
1354 "r" (srcStride),
1355 "r" (dstStride)
1356 : "%eax", "%ebx"
1357 );
1358 #else
1359 for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied already so +2
1360 memcpy( &(dst[dstStride*i]),
1361 &(src[srcStride*i]), BLOCK_SIZE);
1362 #endif
1363 }
1364
1365
1366 /**
1367 * Filters array of bytes (Y or U or V values)
1368 */
1369 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
1370 QP_STORE_T QPs[], int QPStride, bool isColor)
1371 {
1372
1373 #ifdef TIMEING
1374 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
1375 sumTime= rdtsc();
1376 #endif
1377
1378 /* we need 64 bit here, otherwise we'll have a problem
1379 after watching a black picture for 5 hours */
1380 static uint64_t *yHistogram= NULL;
1381 if(!yHistogram)
1382 {
1383 yHistogram= new uint64_t[256];
1384 for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256;
1385 }
1386
1387 int black=0, white=255; // blackest black and whitest white in the picture
1388 if(!isColor)
1389 {
1390 uint64_t sum= 0;
1391 for(int i=0; i<256; i++)
1392 sum+= yHistogram[i];
1393
1394 uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold);
1395
1396 uint64_t clipped= sum;
1397 for(black=255; black>0; black--)
1398 {
1399 if(clipped < maxClipped) break;
1400 clipped-= yHistogram[black];
1401 }
1402
1403 clipped= sum;
1404 for(white=0; white<256; white++)
1405 {
1406 if(clipped < maxClipped) break;
1407 clipped-= yHistogram[white];
1408 }
1409
1410 // we can't handle negative corrections
1411 packedYOffset= MAX(black - minAllowedY, 0);
1412 packedYOffset|= packedYOffset<<32;
1413 packedYOffset|= packedYOffset<<16;
1414 packedYOffset|= packedYOffset<<8;
1415
1416 // uint64_t scale= (int)(256.0*256.0/(white-black) + 0.5);
1417 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
1418
1419 packedYScale= uint16_t(scale*256.0 + 0.5);
1420 packedYScale|= packedYScale<<32;
1421 packedYScale|= packedYScale<<16;
1422 }
1423 else
1424 {
1425 packedYScale= 0x0100010001000100LL;
1426 packedYOffset= 0;
1427 }
1428
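/* worked example (hypothetical numbers): with the defaults minAllowedY=0,
   maxAllowedY=255 and maxClippedThreshold=0.01, sampled 1% percentiles of
   black=32 and white=224 give packedYOffset=32 and
   packedYScale=round(255/192*256)=340 (0x0154), so the scaled copy below maps
   32..224 onto roughly 0..255 */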
1429 for(int x=0; x<width; x+=BLOCK_SIZE)
1430 blockCopy(dst + x, dstStride, src + x, srcStride);
1431
1432 for(int y=0; y<height; y+=BLOCK_SIZE)
1433 {
1434 //1% speedup if these are here instead of the inner loop
1435 uint8_t *srcBlock= &(src[y*srcStride]);
1436 uint8_t *dstBlock= &(dst[y*dstStride]);
1437 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
1438 uint8_t *vertBlock= &(dstBlock[dstStride*3]);
1439
1440 // finish 1 block before the next, otherwise we might have a problem
1441 // with the L1 cache of the P4 ... or only do a few blocks at a time or something
1442 for(int x=0; x<width; x+=BLOCK_SIZE)
1443 {
1444 int QP= isColor ?
1445 QPs[(y>>3)*QPStride + (x>>3)]:
1446 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
1447 #ifdef HAVE_MMX
1448 asm volatile(
1449 "movd %0, %%mm7 \n\t"
1450 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
1451 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
1452 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
1453 "movq %%mm7, pQPb \n\t"
1454 : : "r" (QP)
1455 );
1456 #endif
1457
1458
1459 const int stride= dstStride;
1460 if(y + 12 < height)
1461 {
1462 #ifdef MORE_TIMEING
1463 T0= rdtsc();
1464 #endif
1465 #ifdef HAVE_MMX2
1466
1467 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
1468 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
1469 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
1470 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
1471 #endif
1472 if(!isColor) yHistogram[ srcBlock[0] ]++;
1473
1474 blockCopy(vertBlock + dstStride*2, dstStride,
1475 vertSrcBlock + srcStride*2, srcStride);
1476
1477
1478 #ifdef MORE_TIMEING
1479 T1= rdtsc();
1480 memcpyTime+= T1-T0;
1481 T0=T1;
1482 #endif
1483
1484 if( isVertDC(vertBlock, stride))
1485 {
1486 if(isVertMinMaxOk(vertBlock, stride, QP))
1487 doVertLowPass(vertBlock, stride, QP);
1488 }
1489 else if(x<width)
1490 doVertDefFilter(vertBlock, stride, QP);
1491
1492 #ifdef MORE_TIMEING
1493 T1= rdtsc();
1494 vertTime+= T1-T0;
1495 T0=T1;
1496 #endif
1497 }
1498 else
1499 {
1500 for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied already so +2
1501 memcpy( &(vertBlock[dstStride*i]),
1502 &(vertSrcBlock[srcStride*i]), BLOCK_SIZE);
1503
1504 }
1505
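// the horizontal pass lags one block behind: dstBlock-4 covers columns
// x-4..x+3, i.e. the vertical border between the previous block and the
// current one, hence the x-8 guard below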
1506 if(x - 8 >= 0 && x<width)
1507 {
1508 #ifdef MORE_TIMEING
1509 T0= rdtsc();
1510 #endif
1511
1512 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1513 {
1514 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1515 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1516 }
1517 else
1518 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1519
1520 #ifdef MORE_TIMEING
1521 T1= rdtsc();
1522 horizTime+= T1-T0;
1523 T0=T1;
1524 #endif
1525 dering(dstBlock - 9 - stride, stride, QP);
1526 }
1527 else if(y!=0)
1528 dering(dstBlock - stride*9 + width-9, stride, QP);
1529 //FIXME dering filter will not be applied to last block (bottom right)
1530
1531
1532 dstBlock+=8;
1533 srcBlock+=8;
1534 vertBlock+=8;
1535 vertSrcBlock+=8;
1536 }
1537 }
1538 #ifdef HAVE_MMX
1539 asm volatile("emms");
1540 #endif
1541
1542 #ifdef TIMEING
1543 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
1544 sumTime= rdtsc() - sumTime;
1545 if(!isColor)
1546 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
1547 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000),
1548 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000)
1549 , black, white);
1550 #endif
1551 }