comparison postprocess_template.c @ 118:bdd1788fb53b libpostproc

Change the semantics of CONFIG_*, HAVE_* and ARCH_*: they are now always defined, to either 0 or 1.
author aurel
date Tue, 13 Jan 2009 23:44:16 +0000
parents bf8f52662dc3
children 4a1602d552aa
117:3a76063f4145 118:bdd1788fb53b
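
For readers unfamiliar with the convention change, a minimal sketch of what it means for the guards in this file (illustrative only, not part of the changeset; it simply mirrors the REAL_PAVGB hunk below): under the old convention configure defined HAVE_MMX2 only when MMX2 support was enabled, so tests had to use #ifdef / #elif defined(); under the new convention the macro is always defined, to 0 or 1, so a plain #if expression works and combined conditions such as HAVE_MMX2 || HAVE_3DNOW can be written directly.

    /* Illustrative sketch only, not part of the changeset. */

    /* Old convention: HAVE_MMX2 exists only when the feature is enabled. */
    #ifdef HAVE_MMX2
    #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
    #elif defined (HAVE_3DNOW)
    #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
    #endif

    /* New convention: HAVE_MMX2 and HAVE_3DNOW are always defined to 0 or 1,
     * so plain #if / #elif expressions (and || combinations) work, and
     * -Wundef can flag any macro that configure forgot to define. */
    #if HAVE_MMX2
    #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
    #elif HAVE_3DNOW
    #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
    #endif
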
29 29
30 #undef PAVGB 30 #undef PAVGB
31 #undef PMINUB 31 #undef PMINUB
32 #undef PMAXUB 32 #undef PMAXUB
33 33
34 #ifdef HAVE_MMX2 34 #if HAVE_MMX2
35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" 35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
36 #elif defined (HAVE_3DNOW) 36 #elif HAVE_3DNOW
37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" 37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
38 #endif 38 #endif
39 #define PAVGB(a,b) REAL_PAVGB(a,b) 39 #define PAVGB(a,b) REAL_PAVGB(a,b)
40 40
41 #ifdef HAVE_MMX2 41 #if HAVE_MMX2
42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" 42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
43 #elif defined (HAVE_MMX) 43 #elif HAVE_MMX
44 #define PMINUB(b,a,t) \ 44 #define PMINUB(b,a,t) \
45 "movq " #a ", " #t " \n\t"\ 45 "movq " #a ", " #t " \n\t"\
46 "psubusb " #b ", " #t " \n\t"\ 46 "psubusb " #b ", " #t " \n\t"\
47 "psubb " #t ", " #a " \n\t" 47 "psubb " #t ", " #a " \n\t"
48 #endif 48 #endif
49 49
50 #ifdef HAVE_MMX2 50 #if HAVE_MMX2
51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" 51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
52 #elif defined (HAVE_MMX) 52 #elif HAVE_MMX
53 #define PMAXUB(a,b) \ 53 #define PMAXUB(a,b) \
54 "psubusb " #a ", " #b " \n\t"\ 54 "psubusb " #a ", " #b " \n\t"\
55 "paddb " #a ", " #b " \n\t" 55 "paddb " #a ", " #b " \n\t"
56 #endif 56 #endif
57 57
58 //FIXME? |255-0| = 1 (should not be a problem ...) 58 //FIXME? |255-0| = 1 (should not be a problem ...)
59 #ifdef HAVE_MMX 59 #if HAVE_MMX
60 /** 60 /**
61 * Check if the middle 8x8 Block in the given 8x16 block is flat 61 * Check if the middle 8x8 Block in the given 8x16 block is flat
62 */ 62 */
63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
64 int numEq= 0, dcOk; 64 int numEq= 0, dcOk;
134 "pcmpgtb %%mm6, %%mm2 \n\t" 134 "pcmpgtb %%mm6, %%mm2 \n\t"
135 "paddb %%mm2, %%mm0 \n\t" 135 "paddb %%mm2, %%mm0 \n\t"
136 "psubusb %%mm3, %%mm4 \n\t" 136 "psubusb %%mm3, %%mm4 \n\t"
137 137
138 " \n\t" 138 " \n\t"
139 #ifdef HAVE_MMX2 139 #if HAVE_MMX2
140 "pxor %%mm7, %%mm7 \n\t" 140 "pxor %%mm7, %%mm7 \n\t"
141 "psadbw %%mm7, %%mm0 \n\t" 141 "psadbw %%mm7, %%mm0 \n\t"
142 #else 142 #else
143 "movq %%mm0, %%mm1 \n\t" 143 "movq %%mm0, %%mm1 \n\t"
144 "psrlw $8, %%mm0 \n\t" 144 "psrlw $8, %%mm0 \n\t"
174 174
175 /** 175 /**
176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) 176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
178 */ 178 */
179 #ifndef HAVE_ALTIVEC 179 #if !HAVE_ALTIVEC
180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
181 { 181 {
182 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 182 #if HAVE_MMX2 || HAVE_3DNOW
183 src+= stride*3; 183 src+= stride*3;
184 __asm__ volatile( //"movv %0 %1 %2\n\t" 184 __asm__ volatile( //"movv %0 %1 %2\n\t"
185 "movq %2, %%mm0 \n\t" // QP,..., QP 185 "movq %2, %%mm0 \n\t" // QP,..., QP
186 "pxor %%mm4, %%mm4 \n\t" 186 "pxor %%mm4, %%mm4 \n\t"
187 187
304 304
305 : 305 :
306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
307 : "%"REG_a, "%"REG_c 307 : "%"REG_a, "%"REG_c
308 ); 308 );
309 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 309 #else //HAVE_MMX2 || HAVE_3DNOW
310 const int l1= stride; 310 const int l1= stride;
311 const int l2= stride + l1; 311 const int l2= stride + l1;
312 const int l3= stride + l2; 312 const int l3= stride + l2;
313 const int l4= stride + l3; 313 const int l4= stride + l3;
314 const int l5= stride + l4; 314 const int l5= stride + l4;
343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
345 345
346 src++; 346 src++;
347 } 347 }
348 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 348 #endif //HAVE_MMX2 || HAVE_3DNOW
349 } 349 }
350 #endif //HAVE_ALTIVEC 350 #endif //HAVE_ALTIVEC
351 351
352 #if 0 352 #if 0
353 /** 353 /**
362 x/8 = 1 362 x/8 = 1
363 1 12 12 23 363 1 12 12 23
364 */ 364 */
365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) 365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
366 { 366 {
367 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 367 #if HAVE_MMX2 || HAVE_3DNOW
368 src+= stride*3; 368 src+= stride*3;
369 // FIXME rounding 369 // FIXME rounding
370 __asm__ volatile( 370 __asm__ volatile(
371 "pxor %%mm7, %%mm7 \n\t" // 0 371 "pxor %%mm7, %%mm7 \n\t" // 0
372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE 372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
424 424
425 : 425 :
426 : "r" (src), "r" ((x86_reg)stride) 426 : "r" (src), "r" ((x86_reg)stride)
427 : "%"REG_a, "%"REG_c 427 : "%"REG_a, "%"REG_c
428 ); 428 );
429 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 429 #else //HAVE_MMX2 || HAVE_3DNOW
430 const int l1= stride; 430 const int l1= stride;
431 const int l2= stride + l1; 431 const int l2= stride + l1;
432 const int l3= stride + l2; 432 const int l3= stride + l2;
433 const int l4= stride + l3; 433 const int l4= stride + l3;
434 const int l5= stride + l4; 434 const int l5= stride + l4;
447 src[x+l5] -=v>>1; 447 src[x+l5] -=v>>1;
448 src[x+l6] -=v>>3; 448 src[x+l6] -=v>>3;
449 } 449 }
450 } 450 }
451 451
452 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 452 #endif //HAVE_MMX2 || HAVE_3DNOW
453 } 453 }
454 #endif //0 454 #endif //0
455 455
456 /** 456 /**
457 * Experimental Filter 1 457 * Experimental Filter 1
460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) 460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
461 * MMX2 version does correct clipping C version does not 461 * MMX2 version does correct clipping C version does not
462 */ 462 */
463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
464 { 464 {
465 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 465 #if HAVE_MMX2 || HAVE_3DNOW
466 src+= stride*3; 466 src+= stride*3;
467 467
468 __asm__ volatile( 468 __asm__ volatile(
469 "pxor %%mm7, %%mm7 \n\t" // 0 469 "pxor %%mm7, %%mm7 \n\t" // 0
470 "lea (%0, %1), %%"REG_a" \n\t" 470 "lea (%0, %1), %%"REG_a" \n\t"
546 546
547 : 547 :
548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) 548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
549 : "%"REG_a, "%"REG_c 549 : "%"REG_a, "%"REG_c
550 ); 550 );
551 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 551 #else //HAVE_MMX2 || HAVE_3DNOW
552 552
553 const int l1= stride; 553 const int l1= stride;
554 const int l2= stride + l1; 554 const int l2= stride + l1;
555 const int l3= stride + l2; 555 const int l3= stride + l2;
556 const int l4= stride + l3; 556 const int l4= stride + l3;
580 src[l6] -=v>>2; 580 src[l6] -=v>>2;
581 src[l7] -=v>>3; 581 src[l7] -=v>>3;
582 } 582 }
583 src++; 583 src++;
584 } 584 }
585 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 585 #endif //HAVE_MMX2 || HAVE_3DNOW
586 } 586 }
587 587
588 #ifndef HAVE_ALTIVEC 588 #if !HAVE_ALTIVEC
589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
590 { 590 {
591 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 591 #if HAVE_MMX2 || HAVE_3DNOW
592 /* 592 /*
593 uint8_t tmp[16]; 593 uint8_t tmp[16];
594 const int l1= stride; 594 const int l1= stride;
595 const int l2= stride + l1; 595 const int l2= stride + l1;
596 const int l3= stride + l2; 596 const int l3= stride + l2;
867 } 867 }
868 } 868 }
869 } 869 }
870 } 870 }
871 */ 871 */
872 #elif defined (HAVE_MMX) 872 #elif HAVE_MMX
873 src+= stride*4; 873 src+= stride*4;
874 __asm__ volatile( 874 __asm__ volatile(
875 "pxor %%mm7, %%mm7 \n\t" 875 "pxor %%mm7, %%mm7 \n\t"
876 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 876 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
877 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 877 "and "ALIGN_MASK", %%"REG_c" \n\t" // align
976 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 976 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
977 977
978 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 978 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
979 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 979 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
980 980
981 #ifdef HAVE_MMX2 981 #if HAVE_MMX2
982 "movq %%mm7, %%mm6 \n\t" // 0 982 "movq %%mm7, %%mm6 \n\t" // 0
983 "psubw %%mm0, %%mm6 \n\t" 983 "psubw %%mm0, %%mm6 \n\t"
984 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 984 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
985 "movq %%mm7, %%mm6 \n\t" // 0 985 "movq %%mm7, %%mm6 \n\t" // 0
986 "psubw %%mm1, %%mm6 \n\t" 986 "psubw %%mm1, %%mm6 \n\t"
1008 "pcmpgtw %%mm3, %%mm6 \n\t" 1008 "pcmpgtw %%mm3, %%mm6 \n\t"
1009 "pxor %%mm6, %%mm3 \n\t" 1009 "pxor %%mm6, %%mm3 \n\t"
1010 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 1010 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1011 #endif 1011 #endif
1012 1012
1013 #ifdef HAVE_MMX2 1013 #if HAVE_MMX2
1014 "pminsw %%mm2, %%mm0 \n\t" 1014 "pminsw %%mm2, %%mm0 \n\t"
1015 "pminsw %%mm3, %%mm1 \n\t" 1015 "pminsw %%mm3, %%mm1 \n\t"
1016 #else 1016 #else
1017 "movq %%mm0, %%mm6 \n\t" 1017 "movq %%mm0, %%mm6 \n\t"
1018 "psubusw %%mm2, %%mm6 \n\t" 1018 "psubusw %%mm2, %%mm6 \n\t"
1072 "pxor %%mm6, %%mm2 \n\t" 1072 "pxor %%mm6, %%mm2 \n\t"
1073 "pxor %%mm7, %%mm3 \n\t" 1073 "pxor %%mm7, %%mm3 \n\t"
1074 "pand %%mm2, %%mm4 \n\t" 1074 "pand %%mm2, %%mm4 \n\t"
1075 "pand %%mm3, %%mm5 \n\t" 1075 "pand %%mm3, %%mm5 \n\t"
1076 1076
1077 #ifdef HAVE_MMX2 1077 #if HAVE_MMX2
1078 "pminsw %%mm0, %%mm4 \n\t" 1078 "pminsw %%mm0, %%mm4 \n\t"
1079 "pminsw %%mm1, %%mm5 \n\t" 1079 "pminsw %%mm1, %%mm5 \n\t"
1080 #else 1080 #else
1081 "movq %%mm4, %%mm2 \n\t" 1081 "movq %%mm4, %%mm2 \n\t"
1082 "psubusw %%mm0, %%mm2 \n\t" 1082 "psubusw %%mm0, %%mm2 \n\t"
1099 1099
1100 : "+r" (src) 1100 : "+r" (src)
1101 : "r" ((x86_reg)stride), "m" (c->pQPb) 1101 : "r" ((x86_reg)stride), "m" (c->pQPb)
1102 : "%"REG_a, "%"REG_c 1102 : "%"REG_a, "%"REG_c
1103 ); 1103 );
1104 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1104 #else //HAVE_MMX2 || HAVE_3DNOW
1105 const int l1= stride; 1105 const int l1= stride;
1106 const int l2= stride + l1; 1106 const int l2= stride + l1;
1107 const int l3= stride + l2; 1107 const int l3= stride + l2;
1108 const int l4= stride + l3; 1108 const int l4= stride + l3;
1109 const int l5= stride + l4; 1109 const int l5= stride + l4;
1137 src[l4]-= d; 1137 src[l4]-= d;
1138 src[l5]+= d; 1138 src[l5]+= d;
1139 } 1139 }
1140 src++; 1140 src++;
1141 } 1141 }
1142 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1142 #endif //HAVE_MMX2 || HAVE_3DNOW
1143 } 1143 }
1144 #endif //HAVE_ALTIVEC 1144 #endif //HAVE_ALTIVEC
1145 1145
1146 #ifndef HAVE_ALTIVEC 1146 #if !HAVE_ALTIVEC
1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) 1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1148 { 1148 {
1149 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1149 #if HAVE_MMX2 || HAVE_3DNOW
1150 __asm__ volatile( 1150 __asm__ volatile(
1151 "pxor %%mm6, %%mm6 \n\t" 1151 "pxor %%mm6, %%mm6 \n\t"
1152 "pcmpeqb %%mm7, %%mm7 \n\t" 1152 "pcmpeqb %%mm7, %%mm7 \n\t"
1153 "movq %2, %%mm0 \n\t" 1153 "movq %2, %%mm0 \n\t"
1154 "punpcklbw %%mm6, %%mm0 \n\t" 1154 "punpcklbw %%mm6, %%mm0 \n\t"
1162 1162
1163 // 0 1 2 3 4 5 6 7 8 9 1163 // 0 1 2 3 4 5 6 7 8 9
1164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1165 1165
1166 #undef FIND_MIN_MAX 1166 #undef FIND_MIN_MAX
1167 #ifdef HAVE_MMX2 1167 #if HAVE_MMX2
1168 #define REAL_FIND_MIN_MAX(addr)\ 1168 #define REAL_FIND_MIN_MAX(addr)\
1169 "movq " #addr ", %%mm0 \n\t"\ 1169 "movq " #addr ", %%mm0 \n\t"\
1170 "pminub %%mm0, %%mm7 \n\t"\ 1170 "pminub %%mm0, %%mm7 \n\t"\
1171 "pmaxub %%mm0, %%mm6 \n\t" 1171 "pmaxub %%mm0, %%mm6 \n\t"
1172 #else 1172 #else
1189 FIND_MIN_MAX((%%REGd, %1, 2)) 1189 FIND_MIN_MAX((%%REGd, %1, 2))
1190 FIND_MIN_MAX((%0, %1, 8)) 1190 FIND_MIN_MAX((%0, %1, 8))
1191 1191
1192 "movq %%mm7, %%mm4 \n\t" 1192 "movq %%mm7, %%mm4 \n\t"
1193 "psrlq $8, %%mm7 \n\t" 1193 "psrlq $8, %%mm7 \n\t"
1194 #ifdef HAVE_MMX2 1194 #if HAVE_MMX2
1195 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1195 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1196 "pshufw $0xF9, %%mm7, %%mm4 \n\t" 1196 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels 1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels
1198 "pshufw $0xFE, %%mm7, %%mm4 \n\t" 1198 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1199 "pminub %%mm4, %%mm7 \n\t" 1199 "pminub %%mm4, %%mm7 \n\t"
1214 #endif 1214 #endif
1215 1215
1216 1216
1217 "movq %%mm6, %%mm4 \n\t" 1217 "movq %%mm6, %%mm4 \n\t"
1218 "psrlq $8, %%mm6 \n\t" 1218 "psrlq $8, %%mm6 \n\t"
1219 #ifdef HAVE_MMX2 1219 #if HAVE_MMX2
1220 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels 1220 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1221 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 1221 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1222 "pmaxub %%mm4, %%mm6 \n\t" 1222 "pmaxub %%mm4, %%mm6 \n\t"
1223 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 1223 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1224 "pmaxub %%mm4, %%mm6 \n\t" 1224 "pmaxub %%mm4, %%mm6 \n\t"
1368 1368
1369 "1: \n\t" 1369 "1: \n\t"
1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) 1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2)
1371 : "%"REG_a, "%"REG_d, "%"REG_c 1371 : "%"REG_a, "%"REG_d, "%"REG_c
1372 ); 1372 );
1373 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1373 #else //HAVE_MMX2 || HAVE_3DNOW
1374 int y; 1374 int y;
1375 int min=255; 1375 int min=255;
1376 int max=0; 1376 int max=0;
1377 int avg; 1377 int avg;
1378 uint8_t *p; 1378 uint8_t *p;
1485 } 1485 }
1486 } 1486 }
1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; 1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1488 } 1488 }
1489 #endif 1489 #endif
1490 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1490 #endif //HAVE_MMX2 || HAVE_3DNOW
1491 } 1491 }
1492 #endif //HAVE_ALTIVEC 1492 #endif //HAVE_ALTIVEC
1493 1493
1494 /** 1494 /**
1495 * Deinterlaces the given block by linearly interpolating every second line. 1495 * Deinterlaces the given block by linearly interpolating every second line.
1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1499 */ 1499 */
1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1501 { 1501 {
1502 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1502 #if HAVE_MMX2 || HAVE_3DNOW
1503 src+= 4*stride; 1503 src+= 4*stride;
1504 __asm__ volatile( 1504 __asm__ volatile(
1505 "lea (%0, %1), %%"REG_a" \n\t" 1505 "lea (%0, %1), %%"REG_a" \n\t"
1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
1507 // 0 1 2 3 4 5 6 7 8 9 1507 // 0 1 2 3 4 5 6 7 8 9
1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1551 * this filter will read lines 3-15 and write 7-13 1551 * this filter will read lines 3-15 and write 7-13
1552 */ 1552 */
1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) 1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1554 { 1554 {
1555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1555 #if HAVE_MMX2 || HAVE_3DNOW
1556 src+= stride*3; 1556 src+= stride*3;
1557 __asm__ volatile( 1557 __asm__ volatile(
1558 "lea (%0, %1), %%"REG_a" \n\t" 1558 "lea (%0, %1), %%"REG_a" \n\t"
1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" 1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) 1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
1593 1593
1594 : : "r" (src), "r" ((x86_reg)stride) 1594 : : "r" (src), "r" ((x86_reg)stride)
1595 : "%"REG_a, "%"REG_d, "%"REG_c 1595 : "%"REG_a, "%"REG_d, "%"REG_c
1596 ); 1596 );
1597 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1597 #else //HAVE_MMX2 || HAVE_3DNOW
1598 int x; 1598 int x;
1599 src+= stride*3; 1599 src+= stride*3;
1600 for(x=0; x<8; x++){ 1600 for(x=0; x<8; x++){
1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); 1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); 1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); 1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); 1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1605 src++; 1605 src++;
1606 } 1606 }
1607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1607 #endif //HAVE_MMX2 || HAVE_3DNOW
1608 } 1608 }
1609 1609
1610 /** 1610 /**
1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. 1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1612 * will be called for every 8x8 block and can read & write from line 4-15 1612 * will be called for every 8x8 block and can read & write from line 4-15
1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1615 * this filter will read lines 4-13 and write 5-11 1615 * this filter will read lines 4-13 and write 5-11
1616 */ 1616 */
1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1618 { 1618 {
1619 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1619 #if HAVE_MMX2 || HAVE_3DNOW
1620 src+= stride*4; 1620 src+= stride*4;
1621 __asm__ volatile( 1621 __asm__ volatile(
1622 "lea (%0, %1), %%"REG_a" \n\t" 1622 "lea (%0, %1), %%"REG_a" \n\t"
1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1624 "pxor %%mm7, %%mm7 \n\t" 1624 "pxor %%mm7, %%mm7 \n\t"
1663 1663
1664 "movq %%mm0, (%2) \n\t" 1664 "movq %%mm0, (%2) \n\t"
1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) 1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1666 : "%"REG_a, "%"REG_d 1666 : "%"REG_a, "%"REG_d
1667 ); 1667 );
1668 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1668 #else //HAVE_MMX2 || HAVE_3DNOW
1669 int x; 1669 int x;
1670 src+= stride*4; 1670 src+= stride*4;
1671 for(x=0; x<8; x++){ 1671 for(x=0; x<8; x++){
1672 int t1= tmp[x]; 1672 int t1= tmp[x];
1673 int t2= src[stride*1]; 1673 int t2= src[stride*1];
1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1682 tmp[x]= t1; 1682 tmp[x]= t1;
1683 1683
1684 src++; 1684 src++;
1685 } 1685 }
1686 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1686 #endif //HAVE_MMX2 || HAVE_3DNOW
1687 } 1687 }
1688 1688
1689 /** 1689 /**
1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. 1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1691 * will be called for every 8x8 block and can read & write from line 4-15 1691 * will be called for every 8x8 block and can read & write from line 4-15
1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1694 * this filter will read lines 4-13 and write 4-11 1694 * this filter will read lines 4-13 and write 4-11
1695 */ 1695 */
1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1697 { 1697 {
1698 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1698 #if HAVE_MMX2 || HAVE_3DNOW
1699 src+= stride*4; 1699 src+= stride*4;
1700 __asm__ volatile( 1700 __asm__ volatile(
1701 "lea (%0, %1), %%"REG_a" \n\t" 1701 "lea (%0, %1), %%"REG_a" \n\t"
1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1703 "pxor %%mm7, %%mm7 \n\t" 1703 "pxor %%mm7, %%mm7 \n\t"
1753 "movq %%mm0, (%2) \n\t" 1753 "movq %%mm0, (%2) \n\t"
1754 "movq %%mm1, (%3) \n\t" 1754 "movq %%mm1, (%3) \n\t"
1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) 1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1756 : "%"REG_a, "%"REG_d 1756 : "%"REG_a, "%"REG_d
1757 ); 1757 );
1758 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1758 #else //HAVE_MMX2 || HAVE_3DNOW
1759 int x; 1759 int x;
1760 src+= stride*4; 1760 src+= stride*4;
1761 for(x=0; x<8; x++){ 1761 for(x=0; x<8; x++){
1762 int t1= tmp[x]; 1762 int t1= tmp[x];
1763 int t2= tmp2[x]; 1763 int t2= tmp2[x];
1782 tmp[x]= t3; 1782 tmp[x]= t3;
1783 tmp2[x]= t1; 1783 tmp2[x]= t1;
1784 1784
1785 src++; 1785 src++;
1786 } 1786 }
1787 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1787 #endif //HAVE_MMX2 || HAVE_3DNOW
1788 } 1788 }
1789 1789
1790 /** 1790 /**
1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. 1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1792 * will be called for every 8x8 block and can read & write from line 4-15 1792 * will be called for every 8x8 block and can read & write from line 4-15
1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1795 * this filter will read lines 4-13 and write 4-11 1795 * this filter will read lines 4-13 and write 4-11
1796 */ 1796 */
1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1798 { 1798 {
1799 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1799 #if HAVE_MMX2 || HAVE_3DNOW
1800 src+= 4*stride; 1800 src+= 4*stride;
1801 __asm__ volatile( 1801 __asm__ volatile(
1802 "lea (%0, %1), %%"REG_a" \n\t" 1802 "lea (%0, %1), %%"REG_a" \n\t"
1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1804 // 0 1 2 3 4 5 6 7 8 9 1804 // 0 1 2 3 4 5 6 7 8 9
1841 "movq %%mm1, (%2) \n\t" 1841 "movq %%mm1, (%2) \n\t"
1842 1842
1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) 1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1844 : "%"REG_a, "%"REG_d 1844 : "%"REG_a, "%"REG_d
1845 ); 1845 );
1846 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1846 #else //HAVE_MMX2 || HAVE_3DNOW
1847 int a, b, c, x; 1847 int a, b, c, x;
1848 src+= 4*stride; 1848 src+= 4*stride;
1849 1849
1850 for(x=0; x<2; x++){ 1850 for(x=0; x<2; x++){
1851 a= *(uint32_t*)&tmp[stride*0]; 1851 a= *(uint32_t*)&tmp[stride*0];
1884 1884
1885 *(uint32_t*)&tmp[stride*0]= c; 1885 *(uint32_t*)&tmp[stride*0]= c;
1886 src += 4; 1886 src += 4;
1887 tmp += 4; 1887 tmp += 4;
1888 } 1888 }
1889 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1889 #endif //HAVE_MMX2 || HAVE_3DNOW
1890 } 1890 }
1891 1891
1892 /** 1892 /**
1893 * Deinterlaces the given block by applying a median filter to every second line. 1893 * Deinterlaces the given block by applying a median filter to every second line.
1894 * will be called for every 8x8 block and can read & write from line 4-15, 1894 * will be called for every 8x8 block and can read & write from line 4-15,
1895 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. 1895 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1896 * lines 4-12 will be read into the deblocking filter and should be deinterlaced 1896 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1897 */ 1897 */
1898 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 1898 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1899 { 1899 {
1900 #ifdef HAVE_MMX 1900 #if HAVE_MMX
1901 src+= 4*stride; 1901 src+= 4*stride;
1902 #ifdef HAVE_MMX2 1902 #if HAVE_MMX2
1903 __asm__ volatile( 1903 __asm__ volatile(
1904 "lea (%0, %1), %%"REG_a" \n\t" 1904 "lea (%0, %1), %%"REG_a" \n\t"
1905 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 1905 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1906 // 0 1 2 3 4 5 6 7 8 9 1906 // 0 1 2 3 4 5 6 7 8 9
1907 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 1907 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2010 src++; 2010 src++;
2011 } 2011 }
2012 #endif //HAVE_MMX 2012 #endif //HAVE_MMX
2013 } 2013 }
2014 2014
2015 #ifdef HAVE_MMX 2015 #if HAVE_MMX
2016 /** 2016 /**
2017 * transposes and shift the given 8x8 Block into dst1 and dst2 2017 * transposes and shift the given 8x8 Block into dst1 and dst2
2018 */ 2018 */
2019 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 2019 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2020 { 2020 {
2178 ); 2178 );
2179 } 2179 }
2180 #endif //HAVE_MMX 2180 #endif //HAVE_MMX
2181 //static long test=0; 2181 //static long test=0;
2182 2182
2183 #ifndef HAVE_ALTIVEC 2183 #if !HAVE_ALTIVEC
2184 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 2184 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2185 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) 2185 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
2186 { 2186 {
2187 // to save a register (FIXME do this outside of the loops) 2187 // to save a register (FIXME do this outside of the loops)
2188 tempBlurredPast[127]= maxNoise[0]; 2188 tempBlurredPast[127]= maxNoise[0];
2189 tempBlurredPast[128]= maxNoise[1]; 2189 tempBlurredPast[128]= maxNoise[1];
2190 tempBlurredPast[129]= maxNoise[2]; 2190 tempBlurredPast[129]= maxNoise[2];
2191 2191
2192 #define FAST_L2_DIFF 2192 #define FAST_L2_DIFF
2193 //#define L1_DIFF //u should change the thresholds too if u try that one 2193 //#define L1_DIFF //u should change the thresholds too if u try that one
2194 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2194 #if HAVE_MMX2 || HAVE_3DNOW
2195 __asm__ volatile( 2195 __asm__ volatile(
2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride 2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride
2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride 2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride
2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2199 // 0 1 2 3 4 5 6 7 8 9 2199 // 0 1 2 3 4 5 6 7 8 9
2477 "4: \n\t" 2477 "4: \n\t"
2478 2478
2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) 2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" 2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2481 ); 2481 );
2482 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2482 #else //HAVE_MMX2 || HAVE_3DNOW
2483 { 2483 {
2484 int y; 2484 int y;
2485 int d=0; 2485 int d=0;
2486 // int sysd=0; 2486 // int sysd=0;
2487 int i; 2487 int i;
2560 } 2560 }
2561 } 2561 }
2562 } 2562 }
2563 } 2563 }
2564 } 2564 }
2565 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) 2565 #endif //HAVE_MMX2 || HAVE_3DNOW
2566 } 2566 }
2567 #endif //HAVE_ALTIVEC 2567 #endif //HAVE_ALTIVEC
2568 2568
2569 #ifdef HAVE_MMX 2569 #if HAVE_MMX
2570 /** 2570 /**
2571 * accurate deblock filter 2571 * accurate deblock filter
2572 */ 2572 */
2573 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2573 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2574 int64_t dc_mask, eq_mask, both_masks; 2574 int64_t dc_mask, eq_mask, both_masks;
2968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 2968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2969 2969
2970 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 2970 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2971 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 2971 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2972 2972
2973 #ifdef HAVE_MMX2 2973 #if HAVE_MMX2
2974 "movq %%mm7, %%mm6 \n\t" // 0 2974 "movq %%mm7, %%mm6 \n\t" // 0
2975 "psubw %%mm0, %%mm6 \n\t" 2975 "psubw %%mm0, %%mm6 \n\t"
2976 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 2976 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2977 "movq %%mm7, %%mm6 \n\t" // 0 2977 "movq %%mm7, %%mm6 \n\t" // 0
2978 "psubw %%mm1, %%mm6 \n\t" 2978 "psubw %%mm1, %%mm6 \n\t"
3000 "pcmpgtw %%mm3, %%mm6 \n\t" 3000 "pcmpgtw %%mm3, %%mm6 \n\t"
3001 "pxor %%mm6, %%mm3 \n\t" 3001 "pxor %%mm6, %%mm3 \n\t"
3002 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 3002 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3003 #endif 3003 #endif
3004 3004
3005 #ifdef HAVE_MMX2 3005 #if HAVE_MMX2
3006 "pminsw %%mm2, %%mm0 \n\t" 3006 "pminsw %%mm2, %%mm0 \n\t"
3007 "pminsw %%mm3, %%mm1 \n\t" 3007 "pminsw %%mm3, %%mm1 \n\t"
3008 #else 3008 #else
3009 "movq %%mm0, %%mm6 \n\t" 3009 "movq %%mm0, %%mm6 \n\t"
3010 "psubusw %%mm2, %%mm6 \n\t" 3010 "psubusw %%mm2, %%mm6 \n\t"
3064 "pxor %%mm6, %%mm2 \n\t" 3064 "pxor %%mm6, %%mm2 \n\t"
3065 "pxor %%mm7, %%mm3 \n\t" 3065 "pxor %%mm7, %%mm3 \n\t"
3066 "pand %%mm2, %%mm4 \n\t" 3066 "pand %%mm2, %%mm4 \n\t"
3067 "pand %%mm3, %%mm5 \n\t" 3067 "pand %%mm3, %%mm5 \n\t"
3068 3068
3069 #ifdef HAVE_MMX2 3069 #if HAVE_MMX2
3070 "pminsw %%mm0, %%mm4 \n\t" 3070 "pminsw %%mm0, %%mm4 \n\t"
3071 "pminsw %%mm1, %%mm5 \n\t" 3071 "pminsw %%mm1, %%mm5 \n\t"
3072 #else 3072 #else
3073 "movq %%mm4, %%mm2 \n\t" 3073 "movq %%mm4, %%mm2 \n\t"
3074 "psubusw %%mm0, %%mm2 \n\t" 3074 "psubusw %%mm0, %%mm2 \n\t"
3114 #undef SCALED_CPY 3114 #undef SCALED_CPY
3115 3115
3116 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, 3116 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3117 int levelFix, int64_t *packedOffsetAndScale) 3117 int levelFix, int64_t *packedOffsetAndScale)
3118 { 3118 {
3119 #ifndef HAVE_MMX 3119 #if !HAVE_MMX
3120 int i; 3120 int i;
3121 #endif 3121 #endif
3122 if(levelFix){ 3122 if(levelFix){
3123 #ifdef HAVE_MMX 3123 #if HAVE_MMX
3124 __asm__ volatile( 3124 __asm__ volatile(
3125 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset 3125 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
3126 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale 3126 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
3127 "lea (%2,%4), %%"REG_a" \n\t" 3127 "lea (%2,%4), %%"REG_a" \n\t"
3128 "lea (%3,%5), %%"REG_d" \n\t" 3128 "lea (%3,%5), %%"REG_d" \n\t"
3129 "pxor %%mm4, %%mm4 \n\t" 3129 "pxor %%mm4, %%mm4 \n\t"
3130 #ifdef HAVE_MMX2 3130 #if HAVE_MMX2
3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3132 "movq " #src1 ", %%mm0 \n\t"\ 3132 "movq " #src1 ", %%mm0 \n\t"\
3133 "movq " #src1 ", %%mm5 \n\t"\ 3133 "movq " #src1 ", %%mm5 \n\t"\
3134 "movq " #src2 ", %%mm1 \n\t"\ 3134 "movq " #src2 ", %%mm1 \n\t"\
3135 "movq " #src2 ", %%mm6 \n\t"\ 3135 "movq " #src2 ", %%mm6 \n\t"\
3201 for(i=0; i<8; i++) 3201 for(i=0; i<8; i++)
3202 memcpy( &(dst[dstStride*i]), 3202 memcpy( &(dst[dstStride*i]),
3203 &(src[srcStride*i]), BLOCK_SIZE); 3203 &(src[srcStride*i]), BLOCK_SIZE);
3204 #endif //HAVE_MMX 3204 #endif //HAVE_MMX
3205 }else{ 3205 }else{
3206 #ifdef HAVE_MMX 3206 #if HAVE_MMX
3207 __asm__ volatile( 3207 __asm__ volatile(
3208 "lea (%0,%2), %%"REG_a" \n\t" 3208 "lea (%0,%2), %%"REG_a" \n\t"
3209 "lea (%1,%3), %%"REG_d" \n\t" 3209 "lea (%1,%3), %%"REG_d" \n\t"
3210 3210
3211 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ 3211 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3241 /** 3241 /**
3242 * Duplicates the given 8 src pixels ? times upward 3242 * Duplicates the given 8 src pixels ? times upward
3243 */ 3243 */
3244 static inline void RENAME(duplicate)(uint8_t src[], int stride) 3244 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3245 { 3245 {
3246 #ifdef HAVE_MMX 3246 #if HAVE_MMX
3247 __asm__ volatile( 3247 __asm__ volatile(
3248 "movq (%0), %%mm0 \n\t" 3248 "movq (%0), %%mm0 \n\t"
3249 "add %1, %0 \n\t" 3249 "add %1, %0 \n\t"
3250 "movq %%mm0, (%0) \n\t" 3250 "movq %%mm0, (%0) \n\t"
3251 "movq %%mm0, (%0, %1) \n\t" 3251 "movq %%mm0, (%0, %1) \n\t"
3278 #endif 3278 #endif
3279 int black=0, white=255; // blackest black and whitest white in the picture 3279 int black=0, white=255; // blackest black and whitest white in the picture
3280 int QPCorrecture= 256*256; 3280 int QPCorrecture= 256*256;
3281 3281
3282 int copyAhead; 3282 int copyAhead;
3283 #ifdef HAVE_MMX 3283 #if HAVE_MMX
3284 int i; 3284 int i;
3285 #endif 3285 #endif
3286 3286
3287 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; 3287 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3288 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; 3288 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3291 uint64_t * const yHistogram= c.yHistogram; 3291 uint64_t * const yHistogram= c.yHistogram;
3292 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; 3292 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3293 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; 3293 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3294 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; 3294 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3295 3295
3296 #ifdef HAVE_MMX 3296 #if HAVE_MMX
3297 for(i=0; i<57; i++){ 3297 for(i=0; i<57; i++){
3298 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; 3298 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3299 int threshold= offset*2 + 1; 3299 int threshold= offset*2 + 1;
3300 c.mmxDcOffset[i]= 0x7F - offset; 3300 c.mmxDcOffset[i]= 0x7F - offset;
3301 c.mmxDcThreshold[i]= 0x7F - threshold; 3301 c.mmxDcThreshold[i]= 0x7F - threshold;
3349 clipped-= yHistogram[white]; 3349 clipped-= yHistogram[white];
3350 } 3350 }
3351 3351
3352 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); 3352 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3353 3353
3354 #ifdef HAVE_MMX2 3354 #if HAVE_MMX2
3355 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); 3355 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3356 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; 3356 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3357 #else 3357 #else
3358 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); 3358 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3359 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; 3359 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3382 // From this point on it is guaranteed that we can read and write 16 lines downward 3382 // From this point on it is guaranteed that we can read and write 16 lines downward
3383 // finish 1 block before the next otherwise we might have a problem 3383 // finish 1 block before the next otherwise we might have a problem
3384 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 3384 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3385 for(x=0; x<width; x+=BLOCK_SIZE){ 3385 for(x=0; x<width; x+=BLOCK_SIZE){
3386 3386
3387 #ifdef HAVE_MMX2 3387 #if HAVE_MMX2
3388 /* 3388 /*
3389 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3389 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3390 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3390 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3391 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3391 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3392 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3392 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3411 : "%"REG_a, "%"REG_d 3411 : "%"REG_a, "%"REG_d
3412 ); 3412 );
3413 3413
3414 #elif defined(HAVE_3DNOW) 3414 #elif HAVE_3DNOW
3415 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 3415 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3455 3455
3456 for(y=0; y<height; y+=BLOCK_SIZE){ 3456 for(y=0; y<height; y+=BLOCK_SIZE){
3457 //1% speedup if these are here instead of the inner loop 3457 //1% speedup if these are here instead of the inner loop
3458 const uint8_t *srcBlock= &(src[y*srcStride]); 3458 const uint8_t *srcBlock= &(src[y*srcStride]);
3459 uint8_t *dstBlock= &(dst[y*dstStride]); 3459 uint8_t *dstBlock= &(dst[y*dstStride]);
3460 #ifdef HAVE_MMX 3460 #if HAVE_MMX
3461 uint8_t *tempBlock1= c.tempBlocks; 3461 uint8_t *tempBlock1= c.tempBlocks;
3462 uint8_t *tempBlock2= c.tempBlocks + 8; 3462 uint8_t *tempBlock2= c.tempBlocks + 8;
3463 #endif 3463 #endif
3464 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; 3464 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3465 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; 3465 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3491 // From this point on it is guaranteed that we can read and write 16 lines downward 3491 // From this point on it is guaranteed that we can read and write 16 lines downward
3492 // finish 1 block before the next otherwise we might have a problem 3492 // finish 1 block before the next otherwise we might have a problem
3493 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 3493 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3494 for(x=0; x<width; x+=BLOCK_SIZE){ 3494 for(x=0; x<width; x+=BLOCK_SIZE){
3495 const int stride= dstStride; 3495 const int stride= dstStride;
3496 #ifdef HAVE_MMX 3496 #if HAVE_MMX
3497 uint8_t *tmpXchg; 3497 uint8_t *tmpXchg;
3498 #endif 3498 #endif
3499 if(isColor){ 3499 if(isColor){
3500 QP= QPptr[x>>qpHShift]; 3500 QP= QPptr[x>>qpHShift];
3501 c.nonBQP= nonBQPptr[x>>qpHShift]; 3501 c.nonBQP= nonBQPptr[x>>qpHShift];
3505 c.nonBQP= nonBQPptr[x>>4]; 3505 c.nonBQP= nonBQPptr[x>>4];
3506 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; 3506 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3507 yHistogram[ srcBlock[srcStride*12 + 4] ]++; 3507 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3508 } 3508 }
3509 c.QP= QP; 3509 c.QP= QP;
3510 #ifdef HAVE_MMX 3510 #if HAVE_MMX
3511 __asm__ volatile( 3511 __asm__ volatile(
3512 "movd %1, %%mm7 \n\t" 3512 "movd %1, %%mm7 \n\t"
3513 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 3513 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3514 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 3514 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3515 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 3515 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3518 : "r" (QP) 3518 : "r" (QP)
3519 ); 3519 );
3520 #endif 3520 #endif
3521 3521
3522 3522
3523 #ifdef HAVE_MMX2 3523 #if HAVE_MMX2
3524 /* 3524 /*
3525 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 3525 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3526 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 3526 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3527 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 3527 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3528 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 3528 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
3547 : "%"REG_a, "%"REG_d 3547 : "%"REG_a, "%"REG_d
3548 ); 3548 );
3549 3549
3550 #elif defined(HAVE_3DNOW) 3550 #elif HAVE_3DNOW
3551 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 3551 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3589 }else if(mode & V_A_DEBLOCK){ 3589 }else if(mode & V_A_DEBLOCK){
3590 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); 3590 RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3591 } 3591 }
3592 } 3592 }
3593 3593
3594 #ifdef HAVE_MMX 3594 #if HAVE_MMX
3595 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 3595 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3596 #endif 3596 #endif
3597 /* check if we have a previous block to deblock it with dstBlock */ 3597 /* check if we have a previous block to deblock it with dstBlock */
3598 if(x - 8 >= 0){ 3598 if(x - 8 >= 0){
3599 #ifdef HAVE_MMX 3599 #if HAVE_MMX
3600 if(mode & H_X1_FILTER) 3600 if(mode & H_X1_FILTER)
3601 RENAME(vertX1Filter)(tempBlock1, 16, &c); 3601 RENAME(vertX1Filter)(tempBlock1, 16, &c);
3602 else if(mode & H_DEBLOCK){ 3602 else if(mode & H_DEBLOCK){
3603 //START_TIMER 3603 //START_TIMER
3604 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); 3604 const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3615 3615
3616 #else 3616 #else
3617 if(mode & H_X1_FILTER) 3617 if(mode & H_X1_FILTER)
3618 horizX1Filter(dstBlock-4, stride, QP); 3618 horizX1Filter(dstBlock-4, stride, QP);
3619 else if(mode & H_DEBLOCK){ 3619 else if(mode & H_DEBLOCK){
3620 #ifdef HAVE_ALTIVEC 3620 #if HAVE_ALTIVEC
3621 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]); 3621 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
3622 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); 3622 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3623 3623
3624 const int t=vertClassify_altivec(tempBlock-48, 16, &c); 3624 const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3625 if(t==1) { 3625 if(t==1) {
3657 } 3657 }
3658 3658
3659 dstBlock+=8; 3659 dstBlock+=8;
3660 srcBlock+=8; 3660 srcBlock+=8;
3661 3661
3662 #ifdef HAVE_MMX 3662 #if HAVE_MMX
3663 tmpXchg= tempBlock1; 3663 tmpXchg= tempBlock1;
3664 tempBlock1= tempBlock2; 3664 tempBlock1= tempBlock2;
3665 tempBlock2 = tmpXchg; 3665 tempBlock2 = tmpXchg;
3666 #endif 3666 #endif
3667 } 3667 }
3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; 3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3698 + dstBlock[x +13*dstStride] 3698 + dstBlock[x +13*dstStride]
3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; 3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3700 }*/ 3700 }*/
3701 } 3701 }
3702 #ifdef HAVE_3DNOW 3702 #if HAVE_3DNOW
3703 __asm__ volatile("femms"); 3703 __asm__ volatile("femms");
3704 #elif defined (HAVE_MMX) 3704 #elif HAVE_MMX
3705 __asm__ volatile("emms"); 3705 __asm__ volatile("emms");
3706 #endif 3706 #endif
3707 3707
3708 #ifdef DEBUG_BRIGHTNESS 3708 #ifdef DEBUG_BRIGHTNESS
3709 if(!isColor){ 3709 if(!isColor){