annotate postproc/postprocess_template.c @ 7974:db1f16543379

enable volume filter and fix nonsense default volume (still not usable because mixer.c has no mechanism to pass volume commands to libaf)
author rfelker
date Wed, 30 Oct 2002 04:11:26 +0000
parents 0a5d69e6f2a2
children a57c1fc0c2fc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1 /*
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
4 This program is free software; you can redistribute it and/or modify
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
5 it under the terms of the GNU General Public License as published by
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
6 the Free Software Foundation; either version 2 of the License, or
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
7 (at your option) any later version.
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
8
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
9 This program is distributed in the hope that it will be useful,
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
12 GNU General Public License for more details.
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
13
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
14 You should have received a copy of the GNU General Public License
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
15 along with this program; if not, write to the Free Software
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
17 */
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
18
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
/*
 * Instruction-emitting helper macros for the inline assembler below.
 * Each macro stringizes its operands and expands to one or more asm lines.
 * They are undefined first so that this template can be processed again
 * with a different set of HAVE_* macros.
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

#if defined (HAVE_MMX2)

/* MMX2: every operation is a single native instruction. */
#define PAVGB(x,y) "pavgb " #x ", " #y " \n\t"
/* The scratch operand is accepted for interface parity but not used here. */
#define PMINUB(x,y,scratch) "pminub " #x ", " #y " \n\t"
#define PMAXUB(x,y) "pmaxub " #x ", " #y " \n\t"

#else /* !HAVE_MMX2 */

#if defined (HAVE_3DNOW)
/* 3DNow! spelling of the packed byte average. */
#define PAVGB(x,y) "pavgusb " #x ", " #y " \n\t"
#endif

#if defined (HAVE_MMX)
/*
 * Plain MMX emulation of the unsigned byte minimum.
 * The result ends up in the second operand (dst); tmp is overwritten:
 *   tmp = dst; tmp = sat(tmp - src); dst -= tmp
 */
#define PMINUB(src,dst,tmp) \
	"movq " #dst ", " #tmp " \n\t"\
	"psubusb " #src ", " #tmp " \n\t"\
	"psubb " #tmp ", " #dst " \n\t"

/*
 * Plain MMX emulation of the unsigned byte maximum.
 * The result ends up in the second operand (dst):
 *   dst = sat(dst - src); dst += src
 */
#define PMAXUB(src,dst) \
	"psubusb " #src ", " #dst " \n\t"\
	"paddb " #src ", " #dst " \n\t"
#endif

#endif /* HAVE_MMX2 */
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
45
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
46
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
47 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
#ifdef HAVE_MMX
/**
 * Check if the middle 8x8 block in the given 8x16 block is flat.
 *
 * For each of the 7 pairs of vertically adjacent rows of the middle block
 * and each of the 8 columns, the byte-wise (wrapping) value
 *     row[y][x] - row[y+1][x] + c->mmxDcOffset[c->nonBQP]
 * is compared, as a signed byte, against c->mmxDcThreshold[c->nonBQP].
 * The number of positions where it is greater is counted.
 *
 * @param src    start of the 8x16 block; the middle block begins 4 rows below
 * @param stride distance in bytes between two consecutive rows
 * @param c      context supplying the offset/threshold tables, nonBQP and
 *               ppMode.flatnessThreshold
 * @return nonzero if the count exceeds c->ppMode.flatnessThreshold
 */
static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
	int packedCount= 0;
	uint8_t *block= src + stride*4; // first row of the middle 8x8 block

	asm volatile(
		"leal (%1, %2), %%eax \n\t"
		// Row addressing (row 0 = first row of the middle block):
		//   row 0: %1        row 1: eax        row 2: eax+%2   row 3: eax+2*%2
		//   row 4: %1+4*%2
		//   eax is then advanced by 4*%2, so that
		//   row 5: eax       row 6: eax+%2     row 7: eax+2*%2
		"movq %3, %%mm7 \n\t" // offset added to every difference
		"movq %4, %%mm6 \n\t" // threshold for the signed compare

		// rows 0 / 1 -- the first compare result initialises the accumulator mm0
		"movq (%1), %%mm0 \n\t"
		"movq (%%eax), %%mm1 \n\t"
		"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
		"paddb %%mm7, %%mm0 \n\t"
		"pcmpgtb %%mm6, %%mm0 \n\t" // 0xFF (= -1) where above threshold, else 0

		// rows 1 / 2
		"movq (%%eax,%2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		// rows 2 / 3
		"movq (%%eax, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"leal (%%eax, %2, 4), %%eax \n\t" // eax now addresses row 5

		// rows 3 / 4
		"movq (%1, %2, 4), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		// rows 4 / 5
		"movq (%%eax), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		// rows 5 / 6
		"movq (%%eax, %2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		// rows 6 / 7
		"movq (%%eax, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		" \n\t"
		// Horizontal sum of the 8 per-column byte counters of mm0;
		// only the lowest byte of the result is used afterwards.
#ifdef HAVE_MMX2
		"pxor %%mm7, %%mm7 \n\t"
		"psadbw %%mm7, %%mm0 \n\t"
#else
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
#endif
		"movd %%mm0, %0 \n\t"
		: "=r" (packedCount)
		: "r" (block), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
		: "%eax"
		);

	// Every hit contributed -1, so negating the low byte yields the hit count.
	return ((-packedCount) & 0xFF) > c->ppMode.flatnessThreshold;
}
#endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
130
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
131 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
132 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
133 #ifdef HAVE_MMX
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
134 int isOk;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
135 src+= stride*3;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
136 asm volatile(
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
137 "movq (%1, %2), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
138 "movq (%1, %2, 8), %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
139 "movq %%mm0, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
140 "psubusb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
141 "psubusb %%mm2, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
142 "por %%mm1, %%mm0 \n\t" // ABS Diff
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
143
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
144 "movq %3, %%mm7 \n\t" // QP,..., QP
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
145 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
146 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
147 "packssdw %%mm0, %%mm0 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
148 "movd %%mm0, %0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
149 : "=r" (isOk)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
150 : "r" (src), "r" (stride), "m" (c->pQPb)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
151 );
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
152 return isOk==0;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
153 #else
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
154 #if 1
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
155 int x;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
156 const int QP= c->QP;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
157 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
158 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
159 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
160 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
161 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
162
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
163 return 1;
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
164 #else
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
165 int x;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
166 const int QP= c->QP;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
167 src+= stride*4;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
168 for(x=0; x<BLOCK_SIZE; x++)
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
169 {
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
170 int min=255;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
171 int max=0;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
172 int y;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
173 for(y=0; y<8; y++){
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
174 int v= src[x + y*stride];
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
175 if(v>max) max=v;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
176 if(v<min) min=v;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
177 }
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
178 if(max-min > 2*QP) return 0;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
179 }
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
180 return 1;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
181 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
182 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
183 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
184
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
185 /**
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
186 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
187 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
188 */
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
189 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
190 {
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
192 src+= stride*3;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
193 asm volatile( //"movv %0 %1 %2\n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
194 "movq %2, %%mm0 \n\t" // QP,..., QP
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
195 "pxor %%mm4, %%mm4 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
196
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
197 "movq (%0), %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
198 "movq (%0, %1), %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
199 "movq %%mm5, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
200 "movq %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
201 "psubusb %%mm6, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
202 "psubusb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
203 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
204 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
205 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
206
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
207 "pand %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
208 "pandn %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
209 "por %%mm2, %%mm6 \n\t"// First Line to Filter
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
210
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
211 "movq (%0, %1, 8), %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
212 "leal (%0, %1, 4), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
213 "leal (%0, %1, 8), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
214 "subl %1, %%ecx \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
215 "addl %1, %0 \n\t" // %0 points to line 1 not 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
216 "movq (%0, %1, 8), %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
217 "movq %%mm5, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
218 "movq %%mm7, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
219 "psubusb %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
220 "psubusb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
221 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
222 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
223 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
224
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
225 "pand %%mm2, %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
226 "pandn %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
227 "por %%mm2, %%mm7 \n\t" // First Line to Filter
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
228
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
229
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
230 // 1 2 3 4 5 6 7 8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
231 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
232 // 6 4 2 2 1 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
233 // 6 4 4 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
234 // 6 8 2
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
235
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
236 "movq (%0, %1), %%mm0 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
237 "movq %%mm0, %%mm1 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
238 PAVGB(%%mm6, %%mm0) //1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
239 PAVGB(%%mm6, %%mm0) //3 1 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
240
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
241 "movq (%0, %1, 4), %%mm2 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
242 "movq %%mm2, %%mm5 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
243 PAVGB((%%eax), %%mm2) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
244 PAVGB((%0, %1, 2), %%mm2) // 211 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
245 "movq %%mm2, %%mm3 \n\t" // 211 /4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
246 "movq (%0), %%mm4 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
247 PAVGB(%%mm4, %%mm3) // 4 211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
248 PAVGB(%%mm0, %%mm3) //642211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
249 "movq %%mm3, (%0) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
250 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
251 "movq %%mm1, %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
252 PAVGB(%%mm6, %%mm0) //1 1 /2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
253 "movq %%mm4, %%mm3 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
254 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
255 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
256 PAVGB((%%eax), %%mm5) // 211 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
257 PAVGB(%%mm5, %%mm3) // 2 2211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
258 PAVGB(%%mm0, %%mm3) //4242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
259 "movq %%mm3, (%0,%1) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
260 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
261 PAVGB(%%mm4, %%mm6) //11 /2
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
262 "movq (%%ecx), %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
263 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
264 "movq %%mm0, %%mm3 \n\t" // 11/2
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
265 PAVGB(%%mm1, %%mm0) // 2 11/4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
266 PAVGB(%%mm6, %%mm0) //222 11/8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
267 PAVGB(%%mm2, %%mm0) //22242211/16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
268 "movq (%0, %1, 2), %%mm2 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
269 "movq %%mm0, (%0, %1, 2) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
270 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
271 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
272 PAVGB((%%ecx), %%mm0) // 11 /2
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
273 PAVGB(%%mm0, %%mm6) //11 11 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
274 PAVGB(%%mm1, %%mm4) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
275 PAVGB(%%mm2, %%mm1) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
276 PAVGB(%%mm1, %%mm6) //1122 11 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
277 PAVGB(%%mm5, %%mm6) //112242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
278 "movq (%%eax), %%mm5 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
279 "movq %%mm6, (%%eax) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
280 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
281 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
282 PAVGB(%%mm7, %%mm6) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
283 PAVGB(%%mm4, %%mm6) // 11 11 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
284 PAVGB(%%mm3, %%mm6) // 11 2211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
285 PAVGB(%%mm5, %%mm2) // 11 /2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
286 "movq (%0, %1, 4), %%mm4 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
287 PAVGB(%%mm4, %%mm2) // 112 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
288 PAVGB(%%mm2, %%mm6) // 112242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
289 "movq %%mm6, (%0, %1, 4) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
290 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
291 PAVGB(%%mm7, %%mm1) // 11 2 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
292 PAVGB(%%mm4, %%mm5) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
293 PAVGB(%%mm5, %%mm0) // 11 11 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
294 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
295 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
296 PAVGB(%%mm0, %%mm1) // 11224222 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
297 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
298 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
299 PAVGB((%%ecx), %%mm2) // 112 4 /8
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
300 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
301 PAVGB(%%mm0, %%mm6) // 1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
302 PAVGB(%%mm7, %%mm6) // 1 12 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
303 PAVGB(%%mm2, %%mm6) // 1122424 /4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
304 "movq %%mm6, (%%ecx) \n\t" // X
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
305 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
306 PAVGB(%%mm7, %%mm5) // 11 2 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
307 PAVGB(%%mm7, %%mm5) // 11 6 /8
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
308
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
309 PAVGB(%%mm3, %%mm0) // 112 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
310 PAVGB(%%mm0, %%mm5) // 112246 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
311 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
312 "subl %1, %0 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
313
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
314 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
315 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
316 : "%eax", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
317 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
318 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
319 const int l1= stride;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
320 const int l2= stride + l1;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
321 const int l3= stride + l2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
322 const int l4= stride + l3;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
323 const int l5= stride + l4;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
324 const int l6= stride + l5;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
325 const int l7= stride + l6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
326 const int l8= stride + l7;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
327 const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
328 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
329 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
330 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
331 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
332 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
333 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
334
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
335 int sums[9];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
336 sums[0] = first + src[l1];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
337 sums[1] = src[l1] + src[l2];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
338 sums[2] = src[l2] + src[l3];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
339 sums[3] = src[l3] + src[l4];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
340 sums[4] = src[l4] + src[l5];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
341 sums[5] = src[l5] + src[l6];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
342 sums[6] = src[l6] + src[l7];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
343 sums[7] = src[l7] + src[l8];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
344 sums[8] = src[l8] + last;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
345
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
346 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
347 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
348 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
349 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
350 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
351 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
352 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
353 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
354
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
355 src++;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
356 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
357 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
358 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
359
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
360 #if 0
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
361 /**
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
362 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
363 * values are correctly clipped (MMX2)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
364 * values are wraparound (C)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
365 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
366 0 8 16 24
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
367 x = 8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
368 x/2 = 4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
369 x/8 = 1
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
370 1 12 12 23
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
371 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
372 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
373 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
374 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
375 src+= stride*3;
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
376 // FIXME rounding
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
377 asm volatile(
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
378 "pxor %%mm7, %%mm7 \n\t" // 0
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
379 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
380 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
381 "leal (%%eax, %1, 4), %%ecx \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
382 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
383 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
384 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
385 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
386 "paddusb "MANGLE(b02)", %%mm0 \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
387 "psrlw $2, %%mm0 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
388 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
389 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
390 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
391 "movq (%%ecx), %%mm3 \n\t" // line 5
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
392 "movq %%mm2, %%mm4 \n\t" // line 4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
393 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
394 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
395 PAVGB(%%mm3, %%mm5)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
396 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
397 "psubusb %%mm3, %%mm4 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
398 "psubusb %%mm2, %%mm3 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
399 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
400 "psubusb %%mm0, %%mm4 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
401 "pcmpeqb %%mm7, %%mm4 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
402 "pand %%mm4, %%mm5 \n\t" // d/2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
403
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
404 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
405 "paddb %%mm5, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
406 // "psubb %%mm6, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
407 "movq %%mm2, (%0,%1, 4) \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
408
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
409 "movq (%%ecx), %%mm2 \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
410 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
411 "psubb %%mm5, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
412 // "psubb %%mm6, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
413 "movq %%mm2, (%%ecx) \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
414
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
415 "paddb %%mm6, %%mm5 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
416 "psrlw $2, %%mm5 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
417 "pand "MANGLE(b3F)", %%mm5 \n\t"
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
418 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
419
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
420 "movq (%%eax, %1, 2), %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
421 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
422 "paddsb %%mm5, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
423 "psubb %%mm6, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
424 "movq %%mm2, (%%eax, %1, 2) \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
425
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
426 "movq (%%ecx, %1), %%mm2 \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
427 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
428 "psubsb %%mm5, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
429 "psubb %%mm6, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
430 "movq %%mm2, (%%ecx, %1) \n\t"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
431
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
432 :
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
433 : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
434 : "%eax", "%ecx"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
435 );
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
436 #else
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
437 const int l1= stride;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
438 const int l2= stride + l1;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
439 const int l3= stride + l2;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
440 const int l4= stride + l3;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
441 const int l5= stride + l4;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
442 const int l6= stride + l5;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
443 // const int l7= stride + l6;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
444 // const int l8= stride + l7;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
445 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
446 int x;
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
447 const int QP15= QP + (QP>>2);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
448 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
449 for(x=0; x<BLOCK_SIZE; x++)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
450 {
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
451 const int v = (src[x+l5] - src[x+l4]);
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
452 if(ABS(v) < QP15)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
453 {
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
454 src[x+l3] +=v>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
455 src[x+l4] +=v>>1;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
456 src[x+l5] -=v>>1;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
457 src[x+l6] -=v>>3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
458
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
459 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
460 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
461
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
462 #endif
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
463 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
464 #endif
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
465
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
466 /**
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
467 * Experimental Filter 1
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
468 * will not damage linear gradients
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
469 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
470 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
471 * MMX2 version does correct clipping, C version doesn't
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
472 */
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
473 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
474 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
475 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
476 src+= stride*3;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
477
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
478 asm volatile(
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
479 "pxor %%mm7, %%mm7 \n\t" // 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
480 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
481 "leal (%%eax, %1, 4), %%ecx \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
482 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
483 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
484 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
485 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
486 "movq %%mm1, %%mm2 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
487 "psubusb %%mm0, %%mm1 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
488 "psubusb %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
489 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
490 "movq (%%ecx), %%mm3 \n\t" // line 5
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
491 "movq (%%ecx, %1), %%mm4 \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
492 "movq %%mm3, %%mm5 \n\t" // line 5
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
493 "psubusb %%mm4, %%mm3 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
494 "psubusb %%mm5, %%mm4 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
495 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
496 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
497 "movq %%mm2, %%mm1 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
498 "psubusb %%mm5, %%mm2 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
499 "movq %%mm2, %%mm4 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
500 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
501 "psubusb %%mm1, %%mm5 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
502 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
503 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
504 "movq %%mm4, %%mm3 \n\t" // d
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
505 "movq %2, %%mm0 \n\t"
5787
5c36f7890b53 x1 deblocking filter bugfix
michael
parents: 4403
diff changeset
506 "paddusb %%mm0, %%mm0 \n\t"
5c36f7890b53 x1 deblocking filter bugfix
michael
parents: 4403
diff changeset
507 "psubusb %%mm0, %%mm4 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
508 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2*QP ? -1 : 0
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
509 "psubusb "MANGLE(b01)", %%mm3 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
510 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
511
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
512 PAVGB(%%mm7, %%mm3) // d/2
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
513 "movq %%mm3, %%mm1 \n\t" // d/2
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
514 PAVGB(%%mm7, %%mm3) // d/4
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
515 PAVGB(%%mm1, %%mm3) // 3*d/8
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
516
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
517 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
519 "psubusb %%mm3, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
520 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
521 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
522
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
523 "movq (%%ecx), %%mm0 \n\t" // line 5
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
524 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
525 "paddusb %%mm3, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
526 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
527 "movq %%mm0, (%%ecx) \n\t" // line 5
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
528
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
529 PAVGB(%%mm7, %%mm1) // d/4
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
530
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
531 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
533 "psubusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
534 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
535 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
536
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
537 "movq (%%ecx, %1), %%mm0 \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
538 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
539 "paddusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
540 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
541 "movq %%mm0, (%%ecx, %1) \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
542
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
543 PAVGB(%%mm7, %%mm1) // d/8
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
544
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
545 "movq (%%eax, %1), %%mm0 \n\t" // line 2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
546 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
547 "psubusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
548 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
549 "movq %%mm0, (%%eax, %1) \n\t" // line 2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
550
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
551 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
552 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
553 "paddusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
554 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
555 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
556
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
557 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
558 : "r" (src), "r" (stride), "m" (co->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
559 : "%eax", "%ecx"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
560 );
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
561 #else
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
562
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
563 const int l1= stride;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
564 const int l2= stride + l1;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
565 const int l3= stride + l2;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
566 const int l4= stride + l3;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
567 const int l5= stride + l4;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
568 const int l6= stride + l5;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
569 const int l7= stride + l6;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
570 // const int l8= stride + l7;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
571 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
572 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
573
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
574 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
575 for(x=0; x<BLOCK_SIZE; x++)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
576 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
577 int a= src[l3] - src[l4];
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
578 int b= src[l4] - src[l5];
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
579 int c= src[l5] - src[l6];
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
580
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
581 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
582 d= MAX(d, 0);
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
583
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
584 if(d < co->QP*2)
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
585 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
586 int v = d * SIGN(-b);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
587
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
588 src[l2] +=v>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
589 src[l3] +=v>>2;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
590 src[l4] +=(3*v)>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
591 src[l5] -=(3*v)>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
592 src[l6] -=v>>2;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
593 src[l7] -=v>>3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
594
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
595 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
596 src++;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
597 }
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
598 #endif
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
599 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
600
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
601 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
602 {
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
603 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
604 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
605 uint8_t tmp[16];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
606 const int l1= stride;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
607 const int l2= stride + l1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
608 const int l3= stride + l2;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
609 const int l4= (int)tmp - (int)src - stride*3;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
610 const int l5= (int)tmp - (int)src - stride*3 + 8;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
611 const int l6= stride*3 + l3;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
612 const int l7= stride + l6;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
613 const int l8= stride + l7;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
614
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
615 memcpy(tmp, src+stride*7, 8);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
616 memcpy(tmp+8, src+stride*8, 8);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
617 */
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
618 src+= stride*4;
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
619 asm volatile(
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
620
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
621 #if 0 //slightly more accurate and slightly slower
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
622 "pxor %%mm7, %%mm7 \n\t" // 0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
623 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
624 "leal (%%eax, %1, 4), %%ecx \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
625 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
626 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
627 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
628
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
629
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
630 "movq (%0, %1, 2), %%mm0 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
631 "movq (%0), %%mm1 \n\t" // l0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
632 "movq %%mm0, %%mm2 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
633 PAVGB(%%mm7, %%mm0) // ~l2/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
634 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
635 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
636
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
637 "movq (%%eax), %%mm1 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
638 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
639 "movq %%mm1, %%mm4 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
640 PAVGB(%%mm7, %%mm1) // ~l1/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
641 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
642 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
643
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
644 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
645 "psubusb %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
646 "psubusb %%mm4, %%mm1 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
647 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
648 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
649
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
650 "movq (%0, %1, 4), %%mm0 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
651 "movq %%mm0, %%mm4 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
652 PAVGB(%%mm7, %%mm0) // ~l4/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
653 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
654 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
655
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
656 "movq (%%ecx), %%mm2 \n\t" // l5
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
657 "movq %%mm3, %%mm5 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
658 PAVGB(%%mm7, %%mm3) // ~l3/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
659 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
660 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
661
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
662 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
663 "psubusb %%mm3, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
664 "psubusb %%mm6, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
665 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
666 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
667 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
668
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
669 "movq (%%ecx, %1), %%mm6 \n\t" // l6
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
670 "movq %%mm6, %%mm5 \n\t" // l6
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
671 PAVGB(%%mm7, %%mm6) // ~l6/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
672 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
673 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
674
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
675 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
676 "movq %%mm2, %%mm4 \n\t" // l5
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
677 PAVGB(%%mm7, %%mm2) // ~l5/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
678 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
679 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
680
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
681 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
682 "psubusb %%mm2, %%mm6 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
683 "psubusb %%mm4, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
684 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
685 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
686
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
687
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
688 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
689 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
690 "paddusb "MANGLE(b01)", %%mm4 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
691 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
692 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
693 "pand %%mm4, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
694
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
695 "movq %%mm3, %%mm1 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
696 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
697 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
698 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
699 "paddusb %%mm1, %%mm3 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
700 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
701
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
702 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
703 "movq (%0, %1, 4), %%mm5 \n\t" //l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
704 "movq (%0, %1, 4), %%mm4 \n\t" //l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
705 "psubusb %%mm6, %%mm5 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
706 "psubusb %%mm4, %%mm6 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
707 "por %%mm6, %%mm5 \n\t" // |l3-l4|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
708 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
709 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
710 "pand %%mm0, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
711 PMINUB(%%mm5, %%mm3, %%mm0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
712
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
713 "psubusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
714 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
715
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
716 "movq (%%eax, %1, 2), %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
717 "movq (%0, %1, 4), %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
718 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
719 "pxor %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
720 "psubb %%mm3, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
721 "paddb %%mm3, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
722 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
723 "pxor %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
724 "movq %%mm0, (%%eax, %1, 2) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
725 "movq %%mm2, (%0, %1, 4) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
726 #endif
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
727
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
728 "leal (%0, %1), %%eax \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
729 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
730 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
731 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
732 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
733
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
734
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
735 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
736 "movq (%0, %1, 4), %%mm0 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
737 "pxor %%mm6, %%mm1 \n\t" // -l3-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
738 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
739 // mm1=-l3-1, mm0=128-q
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
740
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
741 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
742 "movq (%%eax, %1), %%mm3 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
743 "pxor %%mm6, %%mm2 \n\t" // -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
744 "movq %%mm2, %%mm5 \n\t" // -l5-1
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
745 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
746 "leal (%%eax, %1, 4), %%ecx \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
747 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
748 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
749 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
750 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
751 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
752
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
753 "movq (%%eax), %%mm2 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
754 "pxor %%mm6, %%mm2 \n\t" // -l1-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
755 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
756 PAVGB((%0), %%mm1) // (l0-l3+256)/2
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
757 "movq "MANGLE(b80)", %%mm3 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
758 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
759 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
760 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
761 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
762
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
763 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
764 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
765 "pxor %%mm6, %%mm1 \n\t" // -l7-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
766 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
767 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
768 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
769 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
770 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
771 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
772
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
773 "movq "MANGLE(b00)", %%mm1 \n\t" // 0
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
774 "movq "MANGLE(b00)", %%mm5 \n\t" // 0
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
775 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
776 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
777 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
778 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
779 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
780
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
781 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
782
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
783 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
784 "movq %2, %%mm2 \n\t" // QP
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
785 PAVGB(%%mm6, %%mm2) // 128 + QP/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
786 "psubb %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
787
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
788 "movq %%mm4, %%mm1 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
789 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
790 "pxor %%mm1, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
791 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
792 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
793 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
794 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
795
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
796 "movq %%mm4, %%mm3 \n\t" // d
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
797 "psubusb "MANGLE(b01)", %%mm4 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
798 PAVGB(%%mm7, %%mm4) // d/32
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
799 PAVGB(%%mm7, %%mm4) // (d + 32)/64
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
800 "paddb %%mm3, %%mm4 \n\t" // 5d/64
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
801 "pand %%mm2, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
802
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
803 "movq "MANGLE(b80)", %%mm5 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
804 "psubb %%mm0, %%mm5 \n\t" // q
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
805 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
806 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
807 "pxor %%mm7, %%mm5 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
808
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
809 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
810 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
811
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
812 "pand %%mm7, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
813 "movq (%%eax, %1, 2), %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
814 "movq (%0, %1, 4), %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
815 "pxor %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
816 "pxor %%mm1, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
817 "paddb %%mm4, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
818 "psubb %%mm4, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
819 "pxor %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
820 "pxor %%mm1, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
821 "movq %%mm0, (%%eax, %1, 2) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
822 "movq %%mm2, (%0, %1, 4) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
823
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
824 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
825 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
826 : "%eax", "%ecx"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
827 );
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
828
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
829 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
830 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
831 int x;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
832 src-= stride;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
833 for(x=0; x<BLOCK_SIZE; x++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
834 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
835 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
836 if(ABS(middleEnergy)< 8*QP)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
837 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
838 const int q=(src[l4] - src[l5])/2;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
839 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
840 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
841
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
842 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
843 d= MAX(d, 0);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
844
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
845 d= (5*d + 32) >> 6;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
846 d*= SIGN(-middleEnergy);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
847
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
848 if(q>0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
849 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
850 d= d<0 ? 0 : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
851 d= d>q ? q : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
852 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
853 else
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
854 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
855 d= d>0 ? 0 : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
856 d= d<q ? q : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
857 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
858
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
859 src[l4]-= d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
860 src[l5]+= d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
861 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
862 src++;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
863 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
864 src-=8;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
865 for(x=0; x<8; x++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
866 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
867 int y;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
868 for(y=4; y<6; y++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
869 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
870 int d= src[x+y*stride] - tmp[x+(y-4)*8];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
871 int ad= ABS(d);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
872 static int max=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
873 static int sum=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
874 static int num=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
875 static int bias=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
876
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
877 if(max<ad) max=ad;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
878 sum+= ad>3 ? 1 : 0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
879 if(ad>3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
880 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
881 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
882 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
883 if(y==4) bias+=d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
884 num++;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
885 if(num%1000000 == 0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
886 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
887 printf(" %d %d %d %d\n", num, sum, max, bias);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
888 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
889 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
890 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
891 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
892 */
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
893 #elif defined (HAVE_MMX)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
894 src+= stride*4;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
895
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
896 asm volatile(
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
897 "pxor %%mm7, %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
898 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
899 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
900 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
901 "andl $0xFFFFFFF8, %%ecx \n\t" // align
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
902 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
903 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
904 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
905
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
906 "movq (%0), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
907 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
908 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
909 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
910
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
911 "movq (%%eax), %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
912 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
913 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
914 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
915
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
916 "movq (%%eax, %1), %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
917 "movq %%mm4, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
918 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
919 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
920
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
921 "paddw %%mm0, %%mm0 \n\t" // 2L0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
922 "paddw %%mm1, %%mm1 \n\t" // 2H0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
923 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
924 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
925 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
926 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
927
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
928 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
929 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
930 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
931 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
932
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
933 "movq (%%eax, %1, 2), %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
934 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
935 "punpcklbw %%mm7, %%mm2 \n\t" // L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
936 "punpckhbw %%mm7, %%mm3 \n\t" // H3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
937
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
940 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
941 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
942 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
943 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
944
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
945 "movq (%0, %1, 4), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
946 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
947 "punpcklbw %%mm7, %%mm0 \n\t" // L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
948 "punpckhbw %%mm7, %%mm1 \n\t" // H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
949
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
950 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
951 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
952 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
953 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
954 "paddw %%mm4, %%mm4 \n\t" // 2L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
955 "paddw %%mm5, %%mm5 \n\t" // 2H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
956 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
957 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
958
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
959 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
960 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
961 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
962 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
963 //50 opcodes so far
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
964 "movq (%%edx), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
965 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
966 "punpcklbw %%mm7, %%mm2 \n\t" // L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
967 "punpckhbw %%mm7, %%mm3 \n\t" // H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
968 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
969 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
972
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
973 "movq (%%edx, %1), %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
974 "punpcklbw %%mm7, %%mm6 \n\t" // L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
975 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
976 "movq (%%edx, %1), %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
977 "punpckhbw %%mm7, %%mm6 \n\t" // H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
978 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
979
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
980 "paddw %%mm0, %%mm0 \n\t" // 2L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
981 "paddw %%mm1, %%mm1 \n\t" // 2H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
982 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
983 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
984
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
985 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
986 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
987 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
988 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
989
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
990 "movq (%%edx, %1, 2), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
991 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
992 "punpcklbw %%mm7, %%mm2 \n\t" // L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
993 "punpckhbw %%mm7, %%mm3 \n\t" // H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
994
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
995 "paddw %%mm2, %%mm2 \n\t" // 2L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
996 "paddw %%mm3, %%mm3 \n\t" // 2H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
997 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
998 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
999
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1000 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1001 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1002
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1003 #ifdef HAVE_MMX2
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1004 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1005 "psubw %%mm0, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1006 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1007 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1008 "psubw %%mm1, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1009 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1010 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1011 "psubw %%mm2, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1012 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1013 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1014 "psubw %%mm3, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1015 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1016 #else
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1017 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1018 "pcmpgtw %%mm0, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1019 "pxor %%mm6, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1020 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1021 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1022 "pcmpgtw %%mm1, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1023 "pxor %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1024 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1025 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1026 "pcmpgtw %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1027 "pxor %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1028 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1029 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1030 "pcmpgtw %%mm3, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1031 "pxor %%mm6, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1032 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1033 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1034
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1035 #ifdef HAVE_MMX2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1036 "pminsw %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1037 "pminsw %%mm3, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1038 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1039 "movq %%mm0, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1040 "psubusw %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1041 "psubw %%mm6, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1042 "movq %%mm1, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1043 "psubusw %%mm3, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1044 "psubw %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1045 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1046
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1047 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1048 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1049 "pxor %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1050 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1051 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1052 "pxor %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1053 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1054 // 100 opcodes
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1055 "movd %2, %%mm2 \n\t" // QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1056 "psllw $3, %%mm2 \n\t" // 8QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1057 "movq %%mm2, %%mm3 \n\t" // 8QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1058 "pcmpgtw %%mm4, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1059 "pcmpgtw %%mm5, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1060 "pand %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1061 "pand %%mm3, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1062
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1063
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1064 "psubusw %%mm0, %%mm4 \n\t" // hd
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1065 "psubusw %%mm1, %%mm5 \n\t" // ld
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1066
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1067
4253
4b39bde9f7ad fix mangling with runtime cpu detection
atmos4
parents: 4248
diff changeset
1068 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1069 "pmullw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1070 "pmullw %%mm2, %%mm5 \n\t"
4253
4b39bde9f7ad fix mangling with runtime cpu detection
atmos4
parents: 4248
diff changeset
1071 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1072 "paddw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1073 "paddw %%mm2, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1074 "psrlw $6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1075 "psrlw $6, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1076
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1077 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1078 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1079
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1080 "pxor %%mm2, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1081 "pxor %%mm3, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1082
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1083 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1084 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1085 "pxor %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1086 "pxor %%mm3, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1087 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1088 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1089 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1090 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1091
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1092 "pxor %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1093 "pxor %%mm7, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1094 "pand %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1095 "pand %%mm3, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1096
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1097 #ifdef HAVE_MMX2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1098 "pminsw %%mm0, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1099 "pminsw %%mm1, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1100 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1101 "movq %%mm4, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1102 "psubusw %%mm0, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1103 "psubw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1104 "movq %%mm5, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1105 "psubusw %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1106 "psubw %%mm2, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1107 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1108 "pxor %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1109 "pxor %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1110 "psubw %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1111 "psubw %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1112 "packsswb %%mm5, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1113 "movq (%%eax, %1, 2), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1114 "paddb %%mm4, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1115 "movq %%mm0, (%%eax, %1, 2) \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1116 "movq (%0, %1, 4), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1117 "psubb %%mm4, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1118 "movq %%mm0, (%0, %1, 4) \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1119
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1120 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1121 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1122 : "%eax", "%edx", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1123 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1124 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1125 const int l1= stride;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1126 const int l2= stride + l1;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1127 const int l3= stride + l2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1128 const int l4= stride + l3;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1129 const int l5= stride + l4;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1130 const int l6= stride + l5;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1131 const int l7= stride + l6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1132 const int l8= stride + l7;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1133 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
1134 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1135 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
1136 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1137 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1138 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1139 if(ABS(middleEnergy) < 8*c->QP)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1140 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1141 const int q=(src[l4] - src[l5])/2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1142 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1143 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1144
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1145 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1146 d= MAX(d, 0);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1147
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1148 d= (5*d + 32) >> 6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1149 d*= SIGN(-middleEnergy);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1150
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1151 if(q>0)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1152 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1153 d= d<0 ? 0 : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1154 d= d>q ? q : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1155 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1156 else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1157 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1158 d= d>0 ? 0 : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1159 d= d<q ? q : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1160 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1161
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1162 src[l4]-= d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1163 src[l5]+= d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1164 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1165 src++;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1166 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1167 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1168 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1169
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1170 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1171 {
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1172 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1173 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1174 "pxor %%mm6, %%mm6 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1175 "pcmpeqb %%mm7, %%mm7 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1176 "movq %2, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1177 "punpcklbw %%mm6, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1178 "psrlw $1, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1179 "psubw %%mm7, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1180 "packuswb %%mm0, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1181 "movq %%mm0, %3 \n\t"
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1182
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1183 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1184 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1185
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1186 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1187 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1188
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1189 #undef FIND_MIN_MAX
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1190 #ifdef HAVE_MMX2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1191 #define FIND_MIN_MAX(addr)\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1192 "movq " #addr ", %%mm0 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1193 "pminub %%mm0, %%mm7 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1194 "pmaxub %%mm0, %%mm6 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1195 #else
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1196 #define FIND_MIN_MAX(addr)\
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1197 "movq " #addr ", %%mm0 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1198 "movq %%mm7, %%mm1 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1199 "psubusb %%mm0, %%mm6 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1200 "paddb %%mm0, %%mm6 \n\t"\
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1201 "psubusb %%mm0, %%mm1 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1202 "psubb %%mm1, %%mm7 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1203 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1204
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1205 FIND_MIN_MAX((%%eax))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1206 FIND_MIN_MAX((%%eax, %1))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1207 FIND_MIN_MAX((%%eax, %1, 2))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1208 FIND_MIN_MAX((%0, %1, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1209 FIND_MIN_MAX((%%edx))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1210 FIND_MIN_MAX((%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1211 FIND_MIN_MAX((%%edx, %1, 2))
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1212 FIND_MIN_MAX((%0, %1, 8))
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1213
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1214 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1215 "psrlq $8, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1216 #ifdef HAVE_MMX2
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1217 "pminub %%mm4, %%mm7 \n\t" // min of pixels
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1218 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1219 "pminub %%mm4, %%mm7 \n\t" // min of pixels
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1220 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1221 "pminub %%mm4, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1222 #else
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1223 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1224 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1225 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1226 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1227 "psrlq $16, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1228 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1229 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1230 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1231 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1232 "psrlq $32, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1233 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1234 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1235 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1236 #endif
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1237
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1238
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1239 "movq %%mm6, %%mm4 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1240 "psrlq $8, %%mm6 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1241 #ifdef HAVE_MMX2
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1242 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1243 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1244 "pmaxub %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1245 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1246 "pmaxub %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1247 #else
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1248 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1249 "paddb %%mm4, %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1250 "movq %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1251 "psrlq $16, %%mm6 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1252 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1253 "paddb %%mm4, %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1254 "movq %%mm6, %%mm4 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1255 "psrlq $32, %%mm6 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1256 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1257 "paddb %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1258 #endif
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1259 "movq %%mm6, %%mm0 \n\t" // max
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1260 "psubb %%mm7, %%mm6 \n\t" // max - min
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1261 "movd %%mm6, %%ecx \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1262 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1263 " jb 1f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1264 "leal -24(%%esp), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1265 "andl $0xFFFFFFF8, %%ecx \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1266 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1267 "punpcklbw %%mm7, %%mm7 \n\t"
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1268 "punpcklbw %%mm7, %%mm7 \n\t"
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1269 "punpcklbw %%mm7, %%mm7 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1270 "movq %%mm7, (%%ecx) \n\t"
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1271
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1272 "movq (%0), %%mm0 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1273 "movq %%mm0, %%mm1 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1274 "movq %%mm0, %%mm2 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1275 "psllq $8, %%mm1 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1276 "psrlq $8, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1277 "movd -4(%0), %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1278 "movd 8(%0), %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1279 "psrlq $24, %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1280 "psllq $56, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1281 "por %%mm3, %%mm1 \n\t" // L00
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1282 "por %%mm4, %%mm2 \n\t" // L20
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1283 "movq %%mm1, %%mm3 \n\t" // L00
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1284 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1285 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1286 "psubusb %%mm7, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1287 "psubusb %%mm7, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1288 "psubusb %%mm7, %%mm3 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1289 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1290 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1291 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1292 "paddb %%mm2, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1293 "paddb %%mm3, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1294
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1295 "movq (%%eax), %%mm2 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1296 "movq %%mm2, %%mm3 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1297 "movq %%mm2, %%mm4 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1298 "psllq $8, %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1299 "psrlq $8, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1300 "movd -4(%%eax), %%mm5 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1301 "movd 8(%%eax), %%mm6 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1302 "psrlq $24, %%mm5 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1303 "psllq $56, %%mm6 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1304 "por %%mm5, %%mm3 \n\t" // L01
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1305 "por %%mm6, %%mm4 \n\t" // L21
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1306 "movq %%mm3, %%mm5 \n\t" // L01
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1307 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1308 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1309 "psubusb %%mm7, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1310 "psubusb %%mm7, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1311 "psubusb %%mm7, %%mm5 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1312 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1313 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1314 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1315 "paddb %%mm4, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1316 "paddb %%mm5, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1317 // 0, 2, 3, 1
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1318 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1319 "movq " #src ", " #sx " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1320 "movq " #sx ", " #lx " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1321 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1322 "psllq $8, " #lx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1323 "psrlq $8, " #t0 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1324 "movd -4" #src ", " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1325 "psrlq $24, " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1326 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1327 "movd 8" #src ", " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1328 "psllq $56, " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1329 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1330 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1331 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1332 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
2478
42d5846eeb51 faster dering
michael
parents: 2477
diff changeset
1333 PAVGB(lx, pplx) \
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1334 "movq " #lx ", 8(%%ecx) \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1335 "movq (%%ecx), " #lx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1336 "psubusb " #lx ", " #t1 " \n\t"\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1337 "psubusb " #lx ", " #t0 " \n\t"\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1338 "psubusb " #lx ", " #sx " \n\t"\
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1339 "movq "MANGLE(b00)", " #lx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1340 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1341 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1342 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1343 "paddb " #t1 ", " #t0 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1344 "paddb " #t0 ", " #sx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1345 \
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1346 PAVGB(plx, pplx) /* filtered */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1347 "movq " #dst ", " #t0 " \n\t" /* dst */\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1348 "movq " #t0 ", " #t1 " \n\t" /* dst */\
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1349 "psubusb %3, " #t0 " \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1350 "paddusb %3, " #t1 " \n\t"\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1351 PMAXUB(t0, pplx)\
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1352 PMINUB(t1, pplx, t0)\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1353 "paddb " #sx ", " #ppsx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1354 "paddb " #psx ", " #ppsx " \n\t"\
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1355 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1356 "pand "MANGLE(b08)", " #ppsx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1357 "pcmpeqb " #lx ", " #ppsx " \n\t"\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1358 "pand " #ppsx ", " #pplx " \n\t"\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1359 "pandn " #dst ", " #ppsx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1360 "por " #pplx ", " #ppsx " \n\t"\
2478
42d5846eeb51 faster dering
michael
parents: 2477
diff changeset
1361 "movq " #ppsx ", " #dst " \n\t"\
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1362 "movq 8(%%ecx), " #lx " \n\t"
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1363
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1364 /*
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1365 0000000
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1366 1111111
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1367
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1368 1111110
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1369 1111101
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1370 1111100
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1371 1111011
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1372 1111010
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1373 1111001
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1374
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1375 1111000
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1376 1110111
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1377
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1378 */
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1379 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1380 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1381 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1382 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1383 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1384 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1385 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1386 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1387 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1388
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1389 "1: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1390 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1391 : "%eax", "%edx", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1392 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1393 #else
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1394 int y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1395 int min=255;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1396 int max=0;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1397 int avg;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1398 uint8_t *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1399 int s[10];
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1400 const int QP2= c->QP/2 + 1;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1401
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1402 for(y=1; y<9; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1403 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1404 int x;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1405 p= src + stride*y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1406 for(x=1; x<9; x++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1407 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1408 p++;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1409 if(*p > max) max= *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1410 if(*p < min) min= *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1411 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1412 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1413 avg= (min + max + 1)>>1;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1414
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1415 if(max - min <deringThreshold) return;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1416
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1417 for(y=0; y<10; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1418 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1419 int t = 0;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1420
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1421 if(src[stride*y + 0] > avg) t+= 1;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1422 if(src[stride*y + 1] > avg) t+= 2;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1423 if(src[stride*y + 2] > avg) t+= 4;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1424 if(src[stride*y + 3] > avg) t+= 8;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1425 if(src[stride*y + 4] > avg) t+= 16;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1426 if(src[stride*y + 5] > avg) t+= 32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1427 if(src[stride*y + 6] > avg) t+= 64;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1428 if(src[stride*y + 7] > avg) t+= 128;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1429 if(src[stride*y + 8] > avg) t+= 256;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1430 if(src[stride*y + 9] > avg) t+= 512;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1431
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1432 t |= (~t)<<16;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1433 t &= (t<<1) & (t>>1);
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1434 s[y] = t;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1435 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1436
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1437 for(y=1; y<9; y++)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1438 {
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1439 int t = s[y-1] & s[y] & s[y+1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1440 t|= t>>16;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1441 s[y-1]= t;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1442 }
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1443
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1444 for(y=1; y<9; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1445 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1446 int x;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1447 int t = s[y-1];
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1448
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1449 p= src + stride*y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1450 for(x=1; x<9; x++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1451 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1452 p++;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1453 if(t & (1<<x))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1454 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1455 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1456 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1457 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1458 f= (f + 8)>>4;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1459
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1460 #ifdef DEBUG_DERING_THRESHOLD
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1461 asm volatile("emms\n\t":);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1462 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1463 static long long numPixels=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1464 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1465 // if((max-min)<20 || (max-min)*QP<200)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1466 // if((max-min)*QP < 500)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1467 // if(max-min<QP/2)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1468 if(max-min < 20)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1469 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1470 static int numSkiped=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1471 static int errorSum=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1472 static int worstQP=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1473 static int worstRange=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1474 static int worstDiff=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1475 int diff= (f - *p);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1476 int absDiff= ABS(diff);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1477 int error= diff*diff;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1478
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1479 if(x==1 || x==8 || y==1 || y==8) continue;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1480
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1481 numSkiped++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1482 if(absDiff > worstDiff)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1483 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1484 worstDiff= absDiff;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1485 worstQP= QP;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1486 worstRange= max-min;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1487 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1488 errorSum+= error;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1489
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1490 if(1024LL*1024LL*1024LL % numSkiped == 0)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1491 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1492 printf( "sum:%1.3f, skip:%d, wQP:%d, "
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1493 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1494 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1495 worstDiff, (float)numSkiped/numPixels);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1496 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1497 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1498 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1499 #endif
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1500 if (*p + QP2 < f) *p= *p + QP2;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1501 else if(*p - QP2 > f) *p= *p - QP2;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1502 else *p=f;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1503 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1504 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1505 }
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1506 #ifdef DEBUG_DERING_THRESHOLD
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1507 if(max-min < 20)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1508 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1509 for(y=1; y<9; y++)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1510 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1511 int x;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1512 int t = 0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1513 p= src + stride*y;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1514 for(x=1; x<9; x++)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1515 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1516 p++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1517 *p = MIN(*p + 20, 255);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1518 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1519 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1520 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1521 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1522 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1523 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1524 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1525
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1526 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1527 * Deinterlaces the given block by linear interpolation: lines 5, 7, 9 and 11 are each overwritten with the average of the line directly above and the line directly below (8 pixels wide)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1528 * will be called for every 8x8 block and can read & write from line 4-15
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1529 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1530 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1531 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1532 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1533 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1534 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1535 src+= 4*stride; // skip the 4 lines above; row numbers in the comments below are relative to this adjusted src
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1536 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1537 "leal (%0, %1), %%eax \n\t" // eax = src + stride (row 1)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1538 "leal (%%eax, %1, 4), %%ecx \n\t" // ecx = src + 5*stride (row 5)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1539 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1540 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1541
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1542 "movq (%0), %%mm0 \n\t" // mm0 = row 0 (8 bytes)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1543 "movq (%%eax, %1), %%mm1 \n\t" // mm1 = row 2
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1544 PAVGB(%%mm1, %%mm0) // mm0 = average of rows 0 and 2
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1545 "movq %%mm0, (%%eax) \n\t" // row 1 = interpolated
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1546 "movq (%0, %1, 4), %%mm0 \n\t" // mm0 = row 4
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1547 PAVGB(%%mm0, %%mm1) // mm1 = average of rows 2 and 4
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1548 "movq %%mm1, (%%eax, %1, 2) \n\t" // row 3 = interpolated
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1549 "movq (%%ecx, %1), %%mm1 \n\t" // mm1 = row 6
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1550 PAVGB(%%mm1, %%mm0) // mm0 = average of rows 4 and 6
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1551 "movq %%mm0, (%%ecx) \n\t" // row 5 = interpolated
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1552 "movq (%0, %1, 8), %%mm0 \n\t" // mm0 = row 8
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1553 PAVGB(%%mm0, %%mm1) // mm1 = average of rows 6 and 8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1554 "movq %%mm1, (%%ecx, %1, 2) \n\t" // row 7 = interpolated
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1555
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1556 : : "r" (src), "r" (stride) // %0 = src, %1 = stride; no output operands
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1557 : "%eax", "%ecx" // scratch registers used for the row 1 and row 5 addresses
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1558 );
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1559 #else // plain C fallback, same rows as the asm path
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1560 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1561 src+= 4*stride; // skip the 4 lines above, as in the asm path
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1562 for(x=0; x<8; x++) // one pixel column per iteration
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1563 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1564 src[stride] = (src[0] + src[stride*2])>>1; // row 1; NOTE(review): >>1 truncates, while PAVGB presumably rounds up like pavgb/pavgusb (macro not visible here - confirm), so the two paths may differ by 1
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1565 src[stride*3] = (src[stride*2] + src[stride*4])>>1; // row 3
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1566 src[stride*5] = (src[stride*4] + src[stride*6])>>1; // row 5
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1567 src[stride*7] = (src[stride*6] + src[stride*8])>>1; // row 7
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1568 src++; // next column
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1569 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1570 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1571 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1572
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1573 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1574 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1575 * will be called for every 8x8 block and can read & write from line 4-15
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1576 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1577 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1578 * this filter will read lines 3-15 and write 7-13
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1579 * no clipping in C version
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1580 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1581 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1582 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1584 src+= stride*3;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1585 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1586 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1587 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1588 "leal (%%edx, %1, 4), %%ecx \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1589 "addl %1, %%ecx \n\t"
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1590 "pxor %%mm7, %%mm7 \n\t"
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1591 // 0 1 2 3 4 5 6 7 8 9 10
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1592 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1593
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1594 #define DEINT_CUBIC(a,b,c,d,e)\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1595 "movq " #a ", %%mm0 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1596 "movq " #b ", %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1597 "movq " #d ", %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1598 "movq " #e ", %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1599 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1600 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1601 "movq %%mm0, %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1602 "punpcklbw %%mm7, %%mm0 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1603 "punpckhbw %%mm7, %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1604 "movq %%mm1, %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1605 "punpcklbw %%mm7, %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1606 "punpckhbw %%mm7, %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1607 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1608 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1609 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1610 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1611 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1612 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1613 "packuswb %%mm3, %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1614 "movq %%mm1, " #c " \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1615
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1616 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1617 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1618 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1619 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1620
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1621 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1622 : "%eax", "%edx", "ecx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1623 );
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1624 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1625 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1626 src+= stride*3;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1627 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1628 {
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1629 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1630 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1631 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1632 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1633 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1634 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1635 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1636 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1637
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
/**
 * Deinterlaces the given block by filtering every 2nd line with a (-1 4 2 4 -1)/8 filter
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-13 and rewrite lines 5, 7, 9 and 11
 * the result is saturated to 0..255 in the MMX2 / 3DNow version (packuswb) and in the C version
 * @param tmp 8 bytes of state carried between calls: read on entry as the tap 2 lines
 *            above the first rewritten line, overwritten on exit with the unfiltered
 *            content of the last line rewritten by this call (line 11), presumably for
 *            the call on the block below (TODO confirm against the caller)
 */
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*4;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%edx			\n\t"
		"pxor %%mm7, %%mm7				\n\t"
		"movq (%2), %%mm0				\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1

/* b is rewritten from a, b, c, d and from mm0 (the unfiltered line above a),
   afterwards mm0 contains the unfiltered b for the next invocation */
#define DEINT_FF(a,b,c,d)\
		"movq " #a ", %%mm1				\n\t"\
		"movq " #b ", %%mm2				\n\t"\
		"movq " #c ", %%mm3				\n\t"\
		"movq " #d ", %%mm4				\n\t"\
		PAVGB(%%mm3, %%mm1)					\
		PAVGB(%%mm4, %%mm0)					\
		"movq %%mm0, %%mm3				\n\t"\
		"punpcklbw %%mm7, %%mm0				\n\t"\
		"punpckhbw %%mm7, %%mm3				\n\t"\
		"movq %%mm1, %%mm4				\n\t"\
		"punpcklbw %%mm7, %%mm1				\n\t"\
		"punpckhbw %%mm7, %%mm4				\n\t"\
		"psllw $2, %%mm1				\n\t"\
		"psllw $2, %%mm4				\n\t"\
		"psubw %%mm0, %%mm1				\n\t"\
		"psubw %%mm3, %%mm4				\n\t"\
		"movq %%mm2, %%mm5				\n\t"\
		"movq %%mm2, %%mm0				\n\t"\
		"punpcklbw %%mm7, %%mm2				\n\t"\
		"punpckhbw %%mm7, %%mm5				\n\t"\
		"paddw %%mm2, %%mm1				\n\t"\
		"paddw %%mm5, %%mm4				\n\t"\
		"psraw $2, %%mm1				\n\t"\
		"psraw $2, %%mm4				\n\t"\
		"packuswb %%mm4, %%mm1				\n\t"\
		"movq %%mm1, " #b "				\n\t"

DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))

		"movq %%mm0, (%2)				\n\t"
		: : "r" (src), "r" (stride), "r"(tmp)
		: "%eax", "%edx"
	);
#else
	int x;
	src+= stride*4;
	for(x=0; x<8; x++)
	{
		int above= tmp[x];	/* unfiltered line 2 above the line which is rewritten */
		int i;

		/* rewrite the lines 1, 3, 5 and 7 (relative to src) of this column */
		for(i=1; i<8; i+=2)
		{
			/* keep the unfiltered center tap, it is the outer tap of the next rewritten
			   line (same as mm0 in the MMX version) */
			int cur= src[stride*i];
			int v= -above + 4*src[stride*(i-1)] + 2*cur + 4*src[stride*(i+1)] - src[stride*(i+2)] + 4;

			/* the sum can leave the 0..2047 range, storing it unclipped would wrap around */
			if(v < 0) v= 0;
			else
			{
				v>>= 3;
				if(v > 255) v= 255;
			}
			src[stride*i]= v;
			above= cur;
		}
		tmp[x]= above;

		src++;
	}
#endif
}
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1716
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
/**
 * Deinterlaces the given block by blending every line with the 2 lines below it
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * line n becomes (L[n] + 2*L[n+1] + L[n+2])/4, so the image is shifted up by 1 line
 * (FIXME if this is a problem)
 * this filter will read lines 4-13 and write 4-11
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%edx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1

		/* mm0, mm1, mm2 rotate through the 3 lines needed for each output line */
		"movq (%0), %%mm0				\n\t" // L0
		"movq (%%eax, %1), %%mm1			\n\t" // L2
		PAVGB(%%mm1, %%mm0)				      // L0+L2
		"movq (%%eax), %%mm2				\n\t" // L1
		PAVGB(%%mm2, %%mm0)				      // 2L1 + L0 + L2
		"movq %%mm0, (%0)				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
		PAVGB(%%mm0, %%mm2)				      // L1+L3
		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
		"movq %%mm2, (%%eax)				\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t" // L4
		PAVGB(%%mm2, %%mm1)				      // L2+L4
		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1)			\n\t"
		"movq (%%edx), %%mm1				\n\t" // L5
		PAVGB(%%mm1, %%mm0)				      // L3+L5
		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq (%%edx, %1), %%mm0			\n\t" // L6
		PAVGB(%%mm0, %%mm2)				      // L4+L6
		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4)			\n\t"
		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
		PAVGB(%%mm2, %%mm1)				      // L5+L7
		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
		"movq %%mm1, (%%edx)				\n\t"
		"movq (%0, %1, 8), %%mm1			\n\t" // L8
		PAVGB(%%mm1, %%mm0)				      // L6+L8
		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
		"movq %%mm0, (%%edx, %1)			\n\t"
		"movq (%%edx, %1, 4), %%mm0			\n\t" // L9
		PAVGB(%%mm0, %%mm2)				      // L7+L9
		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
		"movq %%mm2, (%%edx, %1, 2)			\n\t"

		: : "r" (src), "r" (stride)
		: "%eax", "%edx"
	);
#else
	int col;
	src+= 4*stride;
	for(col=0; col<8; col++)
	{
		uint8_t *p= src + col;
		int row;

		/* top to bottom: every line only reads itself and the 2 not yet rewritten lines below */
		for(row=0; row<8; row++)
		{
			p[0]= (p[0] + 2*p[stride] + p[stride*2])>>2;
			p+= stride;
		}
	}
#endif
}
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1791
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1792 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1793 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1794 * will be called for every 8x8 block and can read & write from line 4-15,
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1795 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1796 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1797 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1798 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1799 {
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1800 #ifdef HAVE_MMX
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1801 src+= 4*stride;
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1802 #ifdef HAVE_MMX2
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1803 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1804 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1805 "leal (%%eax, %1, 4), %%edx \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1806 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1807 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1808
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1809 "movq (%0), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1810 "movq (%%eax, %1), %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1811 "movq (%%eax), %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1812 "movq %%mm0, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1813 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1814 "pminub %%mm3, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1815 "pmaxub %%mm2, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1816 "pminub %%mm1, %%mm0 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1817 "movq %%mm0, (%%eax) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1818
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1819 "movq (%0, %1, 4), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1820 "movq (%%eax, %1, 2), %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1821 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1822 "pmaxub %%mm1, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1823 "pminub %%mm3, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1824 "pmaxub %%mm0, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1825 "pminub %%mm1, %%mm2 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1826 "movq %%mm2, (%%eax, %1, 2) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1827
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1828 "movq (%%edx), %%mm2 \n\t" //
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1829 "movq (%%edx, %1), %%mm1 \n\t" //
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1830 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1831 "pmaxub %%mm0, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1832 "pminub %%mm3, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1833 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1834 "pminub %%mm0, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1835 "movq %%mm2, (%%edx) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1836
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1837 "movq (%%edx, %1, 2), %%mm2 \n\t" //
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1838 "movq (%0, %1, 8), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1839 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1840 "pmaxub %%mm0, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1841 "pminub %%mm3, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1842 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1843 "pminub %%mm0, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1844 "movq %%mm2, (%%edx, %1, 2) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1845
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1846
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1847 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1848 : "%eax", "%edx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1849 );
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1850
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1851 #else // MMX without MMX2
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1852 asm volatile(
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1853 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1854 "leal (%%eax, %1, 4), %%edx \n\t"
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1855 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1856 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1857 "pxor %%mm7, %%mm7 \n\t"
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1858
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1859 #define MEDIAN(a,b,c)\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1860 "movq " #a ", %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1861 "movq " #b ", %%mm2 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1862 "movq " #c ", %%mm1 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1863 "movq %%mm0, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1864 "movq %%mm1, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1865 "movq %%mm2, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1866 "psubusb %%mm1, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1867 "psubusb %%mm2, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1868 "psubusb %%mm0, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1869 "pcmpeqb %%mm7, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1870 "pcmpeqb %%mm7, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1871 "pcmpeqb %%mm7, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1872 "movq %%mm3, %%mm6 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1873 "pxor %%mm4, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1874 "pxor %%mm5, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1875 "pxor %%mm6, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1876 "por %%mm3, %%mm1 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1877 "por %%mm4, %%mm2 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1878 "por %%mm5, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1879 "pand %%mm2, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1880 "pand %%mm1, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1881 "movq %%mm0, " #b " \n\t"
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1882
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1883 MEDIAN((%0), (%%eax), (%%eax, %1))
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1884 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1885 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1886 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1887
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1888 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1889 : "%eax", "%edx"
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1890 );
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1891 #endif // MMX
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1892 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1893 //FIXME
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1894 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1895 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1896 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1897 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1898 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1899 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1900 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1901 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1902 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1903 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1904 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1905 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1906 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1907 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1908 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1909 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1910
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1911 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1912 /**
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1913 * transposes and shifts the given 8x8 block into dst1 and dst2
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1914 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1915 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1916 {
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1917 asm(
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1918 "leal (%0, %1), %%eax \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1919 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1920 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1921 "movq (%0), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1922 "movq (%%eax), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1923 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1924 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1925 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1926
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1927 "movq (%%eax, %1), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1928 "movq (%%eax, %1, 2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1929 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1930 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1931 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1932
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1933 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1934 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1935 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1936 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1937 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1938 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1939
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1940 "movd %%mm0, 128(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1941 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1942 "movd %%mm0, 144(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1943 "movd %%mm3, 160(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1944 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1945 "movd %%mm3, 176(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1946 "movd %%mm3, 48(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1947 "movd %%mm2, 192(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1948 "movd %%mm2, 64(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1949 "psrlq $32, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1950 "movd %%mm2, 80(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1951 "movd %%mm1, 96(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1952 "psrlq $32, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1953 "movd %%mm1, 112(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1954
7952
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1955 "leal (%%eax, %1, 4), %%eax \n\t"
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1956
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1957 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
7952
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1958 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1959 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1960 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1961 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1962
7952
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1963 "movq (%%eax, %1), %%mm1 \n\t"
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1964 "movq (%%eax, %1, 2), %%mm3 \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1965 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1966 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1967 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1968
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1969 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1970 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1971 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1972 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1973 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1974 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1975
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1976 "movd %%mm0, 132(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1977 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1978 "movd %%mm0, 148(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1979 "movd %%mm3, 164(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1980 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1981 "movd %%mm3, 180(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1982 "movd %%mm3, 52(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1983 "movd %%mm2, 196(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1984 "movd %%mm2, 68(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1985 "psrlq $32, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1986 "movd %%mm2, 84(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1987 "movd %%mm1, 100(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1988 "psrlq $32, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1989 "movd %%mm1, 116(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1990
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1991
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1992 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
7952
be43106d6329 using fewer registers (fixes compilation bug hopefully)
michael
parents: 7948
diff changeset
1993 : "%eax"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1994 );
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1995 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1996
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1997 /**
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1998 * transposes the given 8x8 block
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1999 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2000 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2001 {
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2002 asm(
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2003 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2004 "leal (%%eax, %1, 4), %%edx \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2005 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2006 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2007 "movq (%2), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2008 "movq 16(%2), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2009 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2010 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2011 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2012
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2013 "movq 32(%2), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2014 "movq 48(%2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2015 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2016 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2017 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2018
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2019 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2020 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2021 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2022 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2023 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2024 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2025
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2026 "movd %%mm0, (%0) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2027 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2028 "movd %%mm0, (%%eax) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2029 "movd %%mm3, (%%eax, %1) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2030 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2031 "movd %%mm3, (%%eax, %1, 2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2032 "movd %%mm2, (%0, %1, 4) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2033 "psrlq $32, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2034 "movd %%mm2, (%%edx) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2035 "movd %%mm1, (%%edx, %1) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2036 "psrlq $32, %%mm1 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2037 "movd %%mm1, (%%edx, %1, 2) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2038
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2039
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2040 "movq 64(%2), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2041 "movq 80(%2), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2042 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2043 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2044 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2045
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2046 "movq 96(%2), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2047 "movq 112(%2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2048 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2049 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2050 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2051
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2052 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2053 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2054 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2055 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2056 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2057 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2058
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2059 "movd %%mm0, 4(%0) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2060 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2061 "movd %%mm0, 4(%%eax) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2062 "movd %%mm3, 4(%%eax, %1) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2063 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2064 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2065 "movd %%mm2, 4(%0, %1, 4) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2066 "psrlq $32, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2067 "movd %%mm2, 4(%%edx) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2068 "movd %%mm1, 4(%%edx, %1) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2069 "psrlq $32, %%mm1 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2070 "movd %%mm1, 4(%%edx, %1, 2) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2071
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2072 :: "r" (dst), "r" (dstStride), "r" (src)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2073 : "%eax", "%edx"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2074 );
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2075 }
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2076 #endif
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2077 //static int test=0;
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2078
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2079 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2080 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2081 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2082 // to save a register (FIXME do this outside of the loops)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2083 tempBluredPast[127]= maxNoise[0];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2084 tempBluredPast[128]= maxNoise[1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2085 tempBluredPast[129]= maxNoise[2];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2086
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2087 #define FAST_L2_DIFF
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2088 //#define L1_DIFF //u should change the thresholds too if u try that one
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2089 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2090 asm volatile(
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2091 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2092 "leal (%2, %2, 4), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2093 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2094 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2095 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2096 //FIXME reorder?
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2097 #ifdef L1_DIFF //needs mmx2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2098 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2099 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2100 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2101 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2102 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2103 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2104 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2105 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2106
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2107 "movq (%0, %2, 4), %%mm4 \n\t" // L4
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2108 "paddw %%mm1, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2109 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2110 "movq (%0, %%edx), %%mm5 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2111 "paddw %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2112 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5|
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2113 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2114 "paddw %%mm3, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2115 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2116 "movq (%0, %%ecx), %%mm7 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2117 "paddw %%mm4, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2118 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2119 "paddw %%mm5, %%mm6 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2120 "paddw %%mm7, %%mm6 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2121 "paddw %%mm6, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2122 #elif defined (FAST_L2_DIFF)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2123 "pcmpeqb %%mm7, %%mm7 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
2124 "movq "MANGLE(b80)", %%mm6 \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2125 "pxor %%mm0, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2126 #define L2_DIFF_CORE(a, b)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2127 "movq " #a ", %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2128 "movq " #b ", %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2129 "pxor %%mm7, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2130 PAVGB(%%mm2, %%mm5)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2131 "paddb %%mm6, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2132 "movq %%mm5, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2133 "psllw $8, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2134 "pmaddwd %%mm5, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2135 "pmaddwd %%mm2, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2136 "paddd %%mm2, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2137 "psrld $14, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2138 "paddd %%mm5, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2139
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2140 L2_DIFF_CORE((%0), (%1))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2141 L2_DIFF_CORE((%0, %2), (%1, %2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2142 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2143 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2144 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2145 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2146 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2147 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2148
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2149 #else
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2150 "pxor %%mm7, %%mm7 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2151 "pxor %%mm0, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2152 #define L2_DIFF_CORE(a, b)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2153 "movq " #a ", %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2154 "movq " #b ", %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2155 "movq %%mm5, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2156 "movq %%mm2, %%mm3 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2157 "punpcklbw %%mm7, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2158 "punpckhbw %%mm7, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2159 "punpcklbw %%mm7, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2160 "punpckhbw %%mm7, %%mm3 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2161 "psubw %%mm2, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2162 "psubw %%mm3, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2163 "pmaddwd %%mm5, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2164 "pmaddwd %%mm1, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2165 "paddd %%mm1, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2166 "paddd %%mm5, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2167
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2168 L2_DIFF_CORE((%0), (%1))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2169 L2_DIFF_CORE((%0, %2), (%1, %2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2170 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2171 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2172 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2173 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2174 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2175 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2176
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2177 #endif
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2178
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2179 "movq %%mm0, %%mm4 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2180 "psrlq $32, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2181 "paddd %%mm0, %%mm4 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2182 "movd %%mm4, %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2183 "shll $2, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2184 "movl %3, %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2185 "addl -4(%%edx), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2186 "addl 4(%%edx), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2187 "addl -1024(%%edx), %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2188 "addl $4, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2189 "addl 1024(%%edx), %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2190 "shrl $3, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2191 "movl %%ecx, (%%edx) \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2192
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
2193 // "movl %3, %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2194 // "movl %%ecx, test \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2195 // "jmp 4f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2196 "cmpl 512(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2197 " jb 2f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2198 "cmpl 516(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2199 " jb 1f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2200
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2201 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2202 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2203 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2204 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2205 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2206 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2207 "movq (%0, %2, 4), %%mm4 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2208 "movq (%0, %%edx), %%mm5 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2209 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2210 "movq (%0, %%ecx), %%mm7 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2211 "movq %%mm0, (%1) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2212 "movq %%mm1, (%1, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2213 "movq %%mm2, (%1, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2214 "movq %%mm3, (%1, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2215 "movq %%mm4, (%1, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2216 "movq %%mm5, (%1, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2217 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2218 "movq %%mm7, (%1, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2219 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2220
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2221 "1: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2222 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2223 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2224 "movq (%0), %%mm0 \n\t" // L0
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2225 PAVGB((%1), %%mm0) // L0
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2226 "movq (%0, %2), %%mm1 \n\t" // L1
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2227 PAVGB((%1, %2), %%mm1) // L1
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2228 "movq (%0, %2, 2), %%mm2 \n\t" // L2
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2229 PAVGB((%1, %2, 2), %%mm2) // L2
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2230 "movq (%0, %%eax), %%mm3 \n\t" // L3
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2231 PAVGB((%1, %%eax), %%mm3) // L3
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2232 "movq (%0, %2, 4), %%mm4 \n\t" // L4
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2233 PAVGB((%1, %2, 4), %%mm4) // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2234 "movq (%0, %%edx), %%mm5 \n\t" // L5
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2235 PAVGB((%1, %%edx), %%mm5) // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2236 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2237 PAVGB((%1, %%eax, 2), %%mm6) // L6
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2238 "movq (%0, %%ecx), %%mm7 \n\t" // L7
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2239 PAVGB((%1, %%ecx), %%mm7) // L7
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2240 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2241 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2242 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2243 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2244 "movq %%mm4, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2245 "movq %%mm5, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2246 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2247 "movq %%mm7, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2248 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2249 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2250 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2251 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2252 "movq %%mm4, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2253 "movq %%mm5, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2254 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2255 "movq %%mm7, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2256 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2257
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2258 "2: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2259 "cmpl 508(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2260 " jb 3f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2261
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2262 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2263 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2264 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2265 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2266 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2267 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2268 "movq (%1), %%mm4 \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2269 "movq (%1, %2), %%mm5 \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2270 "movq (%1, %2, 2), %%mm6 \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2271 "movq (%1, %%eax), %%mm7 \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2272 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2273 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2274 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2275 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2276 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2277 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2278 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2279 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2280 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2281 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2282 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2283 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2284 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2285 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2286 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2287 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2288
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2289 "movq (%0, %2, 4), %%mm0 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2290 "movq (%0, %%edx), %%mm1 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2291 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2292 "movq (%0, %%ecx), %%mm3 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2293 "movq (%1, %2, 4), %%mm4 \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2294 "movq (%1, %%edx), %%mm5 \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2295 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2296 "movq (%1, %%ecx), %%mm7 \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2297 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2298 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2299 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2300 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2301 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2302 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2303 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2304 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2305 "movq %%mm0, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2306 "movq %%mm1, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2307 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2308 "movq %%mm3, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2309 "movq %%mm0, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2310 "movq %%mm1, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2311 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2312 "movq %%mm3, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2313 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2314
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2315 "3: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2316 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2317 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2318 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2319 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2320 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2321 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2322 "movq (%1), %%mm4 \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2323 "movq (%1, %2), %%mm5 \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2324 "movq (%1, %2, 2), %%mm6 \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2325 "movq (%1, %%eax), %%mm7 \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2326 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2327 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2328 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2329 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2330 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2331 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2332 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2333 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2334 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2335 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2336 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2337 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2338 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2339 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2340 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2341 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2342 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2343 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2344 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2345 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2346
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2347 "movq (%0, %2, 4), %%mm0 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2348 "movq (%0, %%edx), %%mm1 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2349 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2350 "movq (%0, %%ecx), %%mm3 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2351 "movq (%1, %2, 4), %%mm4 \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2352 "movq (%1, %%edx), %%mm5 \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2353 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2354 "movq (%1, %%ecx), %%mm7 \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2355 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2356 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2357 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2358 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2359 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2360 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2361 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2362 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2363 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2364 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2365 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2366 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2367 "movq %%mm0, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2368 "movq %%mm1, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2369 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2370 "movq %%mm3, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2371 "movq %%mm0, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2372 "movq %%mm1, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2373 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2374 "movq %%mm3, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2375
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2376 "4: \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2377
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2378 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2379 : "%eax", "%edx", "%ecx", "memory"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2380 );
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2381 //printf("%d\n", test);
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2382 #else
7948
5a6cbe774760 fix compilation on non-x86 with gcc 2.95
colin
parents: 7946
diff changeset
2383 {
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2384 int y;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2385 int d=0;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2386 int sysd=0;
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2387 int i;
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2388
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2389 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2390 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2391 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2392 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2393 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2394 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2395 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2396 int d1=ref - cur;
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2397 // if(x==0 || x==7) d1+= d1>>1;
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2398 // if(y==0 || y==7) d1+= d1>>1;
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2399 // d+= ABS(d1);
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2400 d+= d1*d1;
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2401 sysd+= d1;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2402 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2403 }
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2404 i=d;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2405 d= (
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2406 4*d
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2407 +(*(tempBluredPast-256))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2408 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2409 +(*(tempBluredPast+256))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2410 +4)>>3;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2411 *tempBluredPast=i;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2412 // ((*tempBluredPast)*3 + d + 2)>>2;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2413
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2414 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2415 /*
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2416 Switch between
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2417 1 0 0 0 0 0 0 (0)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2418 64 32 16 8 4 2 1 (1)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2419 64 48 36 27 20 15 11 (33) (approx)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2420 64 56 49 43 37 33 29 (200) (approx)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2421 */
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2422 if(d > maxNoise[1])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2423 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2424 if(d < maxNoise[2])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2425 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2426 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2427 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2428 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2429 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2430 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2431 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2432 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2433 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2434 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2435 (ref + cur + 1)>>1;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2436 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2437 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2438 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2439 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2440 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2441 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2442 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2443 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2444 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2445 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2446 tempBlured[ x + y*stride ]= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2447 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2448 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2449 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2450 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2451 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2452 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2453 if(d < maxNoise[0])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2454 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2455 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2456 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2457 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2458 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2459 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2460 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2461 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2462 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2463 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2464 (ref*7 + cur + 4)>>3;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2465 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2466 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2467 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2468 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2469 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2470 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2471 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2472 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2473 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2474 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2475 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2476 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2477 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2478 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2479 (ref*3 + cur + 2)>>2;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2480 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2481 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2482 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2483 }
7948
5a6cbe774760 fix compilation on non-x86 with gcc 2.95
colin
parents: 7946
diff changeset
2484 }
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2485 #endif
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2486 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2487
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2488 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2489 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2490
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2491 /**
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2492 * Copies a block from src to dst and fixes the blacklevel
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2493 * levelFix == 0 -> don't touch the brightness & contrast
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2494 */
7220
e3ecccc7e505 warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents: 6949
diff changeset
2495 #undef SCALED_CPY
e3ecccc7e505 warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents: 6949
diff changeset
2496
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2497 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2498 int levelFix, int64_t *packedOffsetAndScale)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2499 {
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2500 #ifndef HAVE_MMX
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2501 int i;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2502 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2503 if(levelFix)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2504 {
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2505 #ifdef HAVE_MMX
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2506 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2507 "movq (%%eax), %%mm2 \n\t" // packedYOffset
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2508 "movq 8(%%eax), %%mm3 \n\t" // packedYScale
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2509 "leal (%2,%4), %%eax \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2510 "leal (%3,%5), %%edx \n\t"
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2511 "pxor %%mm4, %%mm4 \n\t"
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2512 #ifdef HAVE_MMX2
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2513 #define SCALED_CPY(src1, src2, dst1, dst2) \
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2514 "movq " #src1 ", %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2515 "movq " #src1 ", %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2516 "movq " #src2 ", %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2517 "movq " #src2 ", %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2518 "punpcklbw %%mm0, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2519 "punpckhbw %%mm5, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2520 "punpcklbw %%mm1, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2521 "punpckhbw %%mm6, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2522 "pmulhuw %%mm3, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2523 "pmulhuw %%mm3, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2524 "pmulhuw %%mm3, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2525 "pmulhuw %%mm3, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2526 "psubw %%mm2, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2527 "psubw %%mm2, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2528 "psubw %%mm2, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2529 "psubw %%mm2, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2530 "packuswb %%mm5, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2531 "packuswb %%mm6, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2532 "movq %%mm0, " #dst1 " \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2533 "movq %%mm1, " #dst2 " \n\t"\
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2534
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2535 #else //HAVE_MMX2
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2536 #define SCALED_CPY(src1, src2, dst1, dst2) \
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2537 "movq " #src1 ", %%mm0 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2538 "movq " #src1 ", %%mm5 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2539 "punpcklbw %%mm4, %%mm0 \n\t"\
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2540 "punpckhbw %%mm4, %%mm5 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2541 "psubw %%mm2, %%mm0 \n\t"\
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2542 "psubw %%mm2, %%mm5 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2543 "movq " #src2 ", %%mm1 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2544 "psllw $6, %%mm0 \n\t"\
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2545 "psllw $6, %%mm5 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2546 "pmulhw %%mm3, %%mm0 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2547 "movq " #src2 ", %%mm6 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2548 "pmulhw %%mm3, %%mm5 \n\t"\
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2549 "punpcklbw %%mm4, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2550 "punpckhbw %%mm4, %%mm6 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2551 "psubw %%mm2, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2552 "psubw %%mm2, %%mm6 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2553 "psllw $6, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2554 "psllw $6, %%mm6 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2555 "pmulhw %%mm3, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2556 "pmulhw %%mm3, %%mm6 \n\t"\
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2557 "packuswb %%mm5, %%mm0 \n\t"\
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2558 "packuswb %%mm6, %%mm1 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2559 "movq %%mm0, " #dst1 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2560 "movq %%mm1, " #dst2 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2561
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2562 #endif //!HAVE_MMX2
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2563
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2564 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2565 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2566 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2567 "leal (%%eax,%4,4), %%eax \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2568 "leal (%%edx,%5,4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2569 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2570
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2571
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2572 : "=&a" (packedOffsetAndScale)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2573 : "0" (packedOffsetAndScale),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2574 "r"(src),
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2575 "r"(dst),
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2576 "r" (srcStride),
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2577 "r" (dstStride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2578 : "%edx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2579 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2580 #else
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2581 for(i=0; i<8; i++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2582 memcpy( &(dst[dstStride*i]),
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2583 &(src[srcStride*i]), BLOCK_SIZE);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2584 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2585 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2586 else
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2587 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2588 #ifdef HAVE_MMX
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2589 asm volatile(
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2590 "leal (%0,%2), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2591 "leal (%1,%3), %%edx \n\t"
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2592
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2593 #define SIMPLE_CPY(src1, src2, dst1, dst2) \
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2594 "movq " #src1 ", %%mm0 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2595 "movq " #src2 ", %%mm1 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2596 "movq %%mm0, " #dst1 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2597 "movq %%mm1, " #dst2 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2598
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2599 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2600 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2601 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2602 "leal (%%eax,%2,4), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2603 "leal (%%edx,%3,4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2604 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2605
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2606 : : "r" (src),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2607 "r" (dst),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2608 "r" (srcStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2609 "r" (dstStride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2610 : "%eax", "%edx"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2611 );
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2612 #else
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2613 for(i=0; i<8; i++)
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2614 memcpy( &(dst[dstStride*i]),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2615 &(src[srcStride*i]), BLOCK_SIZE);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2616 #endif
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2617 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2618 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2619
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2620 /**
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2621 * Duplicates the 8 pixels at src into the 3 rows directly above it
 * (src - stride, src - 2*stride and src - 3*stride).
 * @param src pointer to the 8 source pixels that are copied upward
 * @param stride offset in bytes between the start of two consecutive rows
 * NOTE(review): the MMX asm stores to memory through %0 but declares no "memory" clobber - confirm this is intentional.
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2622 */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2623 static inline void RENAME(duplicate)(uint8_t src[], int stride)
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2624 {
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2625 #ifdef HAVE_MMX
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2626 asm volatile(
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2627 "movq (%0), %%mm0 \n\t" // load the 8 source pixels
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2628 "addl %1, %0 \n\t" // %1 is -stride, so %0 now points one row above src
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2629 "movq %%mm0, (%0) \n\t" // row src - stride
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2630 "movq %%mm0, (%0, %1) \n\t" // row src - 2*stride
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2631 "movq %%mm0, (%0, %1, 2) \n\t" // row src - 3*stride
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2632 : "+r" (src) // read-write: the asm advances src itself
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2633 : "r" (-stride) // negated so that adding it moves upward
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2634 );
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2635 #else
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2636 int i;
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2637 uint8_t *p=src; // destination cursor, walks upward from src
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2638 for(i=0; i<3; i++) // 3 copies, matching the 3 stores of the MMX path
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2639 {
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2640 p-= stride; // step one row up
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2641 memcpy(p, src, 8); // always copy from the original source row
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2642 }
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2643 #endif
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2644 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2645
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2646 /**
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2647 * Filters array of bytes (Y or U or V values)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2648 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2649 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2650 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2651 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2652 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2653 int x,y;
3154
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2654 #ifdef COMPILE_TIME_MODE
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2655 const int mode= COMPILE_TIME_MODE;
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2656 #else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2657 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3154
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2658 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2659 int black=0, white=255; // blackest black and whitest white in the picture
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2660 int QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2661
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2662 int copyAhead, i;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2663
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2664 //FIXME remove
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2665 uint64_t * const yHistogram= c.yHistogram;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2666 uint8_t * const tempSrc= c.tempSrc;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2667 uint8_t * const tempDst= c.tempDst;
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2668 const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3832
d05cfaf5f0f2 minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents: 3817
diff changeset
2669
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2670 #ifdef HAVE_MMX
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2671 for(i=0; i<32; i++){
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2672 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2673 int threshold= offset*2 + 1;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2674 c.mmxDcOffset[i]= 0x7F - offset;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2675 c.mmxDcThreshold[i]= 0x7F - threshold;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2676 c.mmxDcOffset[i]*= 0x0101010101010101LL;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2677 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2678 }
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2679 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2680
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2681 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2682 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2683 || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2684 else if( (mode & V_DEBLOCK)
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2685 || (mode & LINEAR_IPOL_DEINT_FILTER)
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2686 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2687 else if(mode & V_X1_FILTER) copyAhead=11;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2688 // else if(mode & V_RK1_FILTER) copyAhead=10;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2689 else if(mode & DERING) copyAhead=9;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2690 else copyAhead=8;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2691
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2692 copyAhead-= 8;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2693
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2694 if(!isColor)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2695 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2696 uint64_t sum= 0;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2697 int i;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2698 uint64_t maxClipped;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2699 uint64_t clipped;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2700 double scale;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2701
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2702 c.frameNum++;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2703 // first frame is fscked so we ignore it
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2704 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2705
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2706 for(i=0; i<256; i++)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2707 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2708 sum+= yHistogram[i];
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2709 // printf("%d ", yHistogram[i]);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2710 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2711 // printf("\n\n");
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2712
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2713 /* we always get a completely black picture first */
7963
0a5d69e6f2a2 cleanup
michael
parents: 7960
diff changeset
2714 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2715
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2716 clipped= sum;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2717 for(black=255; black>0; black--)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2718 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2719 if(clipped < maxClipped) break;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2720 clipped-= yHistogram[black];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2721 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2722
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2723 clipped= sum;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2724 for(white=0; white<256; white++)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2725 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2726 if(clipped < maxClipped) break;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2727 clipped-= yHistogram[white];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2728 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2729
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2730 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2731
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2732 #ifdef HAVE_MMX2
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2733 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2734 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2735 #else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2736 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2737 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2738 #endif
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2739
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2740 c.packedYOffset|= c.packedYOffset<<32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2741 c.packedYOffset|= c.packedYOffset<<16;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2742
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2743 c.packedYScale|= c.packedYScale<<32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2744 c.packedYScale|= c.packedYScale<<16;
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2745
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2746 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2747 else QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2748 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2749 else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2750 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2751 c.packedYScale= 0x0100010001000100LL;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2752 c.packedYOffset= 0;
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2753 QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2754 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2755
2742
d5636499cafd minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents: 2595
diff changeset
2756 /* copy & deinterlace first row of blocks */
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2757 y=-BLOCK_SIZE;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2758 {
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2759 uint8_t *srcBlock= &(src[y*srcStride]);
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2760 uint8_t *dstBlock= tempDst + dstStride;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2761
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2762 // From this point on it is guaranteed that we can read and write 16 lines downward
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2763 // finish 1 block before the next, otherwise we might have a problem
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2764 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2765 for(x=0; x<width; x+=BLOCK_SIZE)
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2766 {
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2767
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2768 #ifdef HAVE_MMX2
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2769 /*
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2770 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2771 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2772 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2773 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2774 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2775
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2776 asm(
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2777 "movl %4, %%eax \n\t"
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2778 "shrl $2, %%eax \n\t"
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2779 "andl $6, %%eax \n\t"
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2780 "addl %5, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2781 "movl %%eax, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2782 "imul %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2783 "imul %3, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2784 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2785 "prefetcht0 32(%%edx, %2) \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2786 "addl %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2787 "addl %3, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2788 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2789 "prefetcht0 32(%%edx, %2) \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2790 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2791 "m" (x), "m" (copyAhead)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2792 : "%eax", "%edx"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2793 );
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2794
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2795 #elif defined(HAVE_3DNOW)
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2796 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2797 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2798 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2799 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2800 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2801 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2802 #endif
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2803
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2804 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2805 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2806
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2807 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2808
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2809 if(mode & LINEAR_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2810 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2811 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2812 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2813 else if(mode & MEDIAN_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2814 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2815 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2816 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2817 else if(mode & FFMPEG_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2818 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2819 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2820 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2821 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2822 dstBlock+=8;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2823 srcBlock+=8;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2824 }
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2825 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2826 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2827
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2828 //printf("\n");
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2829 for(y=0; y<height; y+=BLOCK_SIZE)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2830 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2831 //1% speedup if these are here instead of the inner loop
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2832 uint8_t *srcBlock= &(src[y*srcStride]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2833 uint8_t *dstBlock= &(dst[y*dstStride]);
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2834 #ifdef HAVE_MMX
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2835 uint8_t *tempBlock1= c.tempBlocks;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2836 uint8_t *tempBlock2= c.tempBlocks + 8;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2837 #endif
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2838 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2839 int *nonBQPptr= isColor ? &c.nonBQPTable[(y>>3)*mbWidth] :&c.nonBQPTable[(y>>4)*mbWidth];
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2840 int QP=0;
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2841 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2842 if not then use a temporary buffer */
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2843 if(y+15 >= height)
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2844 {
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2845 int i;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2846 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2847 blockcopy to dst later */
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2848 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2849 srcStride*MAX(height-y-copyAhead, 0) );
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2850
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2851 /* duplicate last line of src to fill the void up to line (copyAhead+7) */
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2852 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2853 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2854
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2855 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2856 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2857
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2858 /* duplicate last line of dst to fill the void up to line (copyAhead) */
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2859 for(i=height-y+1; i<=copyAhead; i++)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2860 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2861
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2862 dstBlock= tempDst + dstStride;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2863 srcBlock= tempSrc;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2864 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2865 //printf("\n");
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2866
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
2867 // From this point on it is guaranteed that we can read and write 16 lines downward
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2868 // finish 1 block before the next otherwise we might have a problem
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2869 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2870 for(x=0; x<width; x+=BLOCK_SIZE)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2871 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2872 const int stride= dstStride;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2873 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2874 uint8_t *tmpXchg;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2875 #endif
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2876 if(isColor)
2428
85cda20c530f more speed
michael
parents: 2416
diff changeset
2877 {
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2878 QP= QPptr[x>>3];
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2879 c.nonBQP= nonBQPptr[x>>3];
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2880 }
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2881 else
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2882 {
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2883 QP= QPptr[x>>4];
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2884 QP= (QP* QPCorrecture + 256*128)>>16;
7960
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2885 c.nonBQP= nonBQPptr[x>>4];
0a4ab841ae29 better deblocking filter
michael
parents: 7952
diff changeset
2886 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
2742
d5636499cafd minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents: 2595
diff changeset
2887 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2428
85cda20c530f more speed
michael
parents: 2416
diff changeset
2888 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2889 c.QP= QP;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2890 #ifdef HAVE_MMX
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2891 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2892 "movd %1, %%mm7 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2893 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2894 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2895 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2896 "movq %%mm7, %0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2897 : "=m" (c.pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2898 : "r" (QP)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2899 );
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2900 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2901
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2902
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2903 #ifdef HAVE_MMX2
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2904 /*
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2905 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2906 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2907 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2908 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2909 */
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2910
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2911 asm(
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2912 "movl %4, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2913 "shrl $2, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2914 "andl $6, %%eax \n\t"
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2915 "addl %5, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2916 "movl %%eax, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2917 "imul %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2918 "imul %3, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2919 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2920 "prefetcht0 32(%%edx, %2) \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2921 "addl %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2922 "addl %3, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2923 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2924 "prefetcht0 32(%%edx, %2) \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2925 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2926 "m" (x), "m" (copyAhead)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2927 : "%eax", "%edx"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2928 );
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2929
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2930 #elif defined(HAVE_3DNOW)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2931 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2932 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2933 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2934 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2935 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2936 */
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2937 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2938
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2939 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2940 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2941
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2942 if(mode & LINEAR_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2943 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2944 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2945 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2946 else if(mode & MEDIAN_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2947 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2948 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2949 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2950 else if(mode & FFMPEG_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2951 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2952 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2953 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
2954 */
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2955
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2956 /* only deblock if we have 2 blocks */
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2957 if(y + 8 < height)
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2958 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2959 if(mode & V_X1_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2960 RENAME(vertX1Filter)(dstBlock, stride, &c);
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2961 else if(mode & V_DEBLOCK)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2962 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2963 if( RENAME(isVertDC)(dstBlock, stride, &c))
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2964 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2965 if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2966 RENAME(doVertLowPass)(dstBlock, stride, &c);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2967 }
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2968 else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2969 RENAME(doVertDefFilter)(dstBlock, stride, &c);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2970 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2971 }
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2972
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2973 #ifdef HAVE_MMX
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2974 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2975 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2976 /* check if we have a previous block to deblock it with dstBlock */
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
2977 if(x - 8 >= 0)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2978 {
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2979 #ifdef HAVE_MMX
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2980 if(mode & H_X1_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2981 RENAME(vertX1Filter)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2982 else if(mode & H_DEBLOCK)
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2983 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2984 if( RENAME(isVertDC)(tempBlock1, 16, &c))
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2985 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2986 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2987 RENAME(doVertLowPass)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2988 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2989 else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2990 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2991 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2992
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2993 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2994
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2995 #else
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2996 if(mode & H_X1_FILTER)
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2997 horizX1Filter(dstBlock-4, stride, QP);
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2998 else if(mode & H_DEBLOCK)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2999 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3000 if( isHorizDC(dstBlock-4, stride, &c))
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3001 {
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3002 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3003 doHorizLowPass(dstBlock-4, stride, QP);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3004 }
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
3005 else
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3006 doHorizDefFilter(dstBlock-4, stride, QP);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3007 }
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3008 #endif
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3009 if(mode & DERING)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3010 {
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3011 //FIXME filter first line
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3012 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3013 }
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3014
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3015 if(mode & TEMP_NOISE_FILTER)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3016 {
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
3017 RENAME(tempNoiseReducer)(dstBlock-8, stride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3018 c.tempBlured[isColor] + y*dstStride + x,
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3019 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3020 c.ppMode.maxTmpNoise);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3021 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3022 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3023
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3024 dstBlock+=8;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3025 srcBlock+=8;
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3026
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
3027 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3028 tmpXchg= tempBlock1;
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3029 tempBlock1= tempBlock2;
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3030 tempBlock2 = tmpXchg;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
3031 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3032 }
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3033
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3034 if(mode & DERING)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3035 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3036 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3037 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3038
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3039 if((mode & TEMP_NOISE_FILTER))
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3040 {
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
3041 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3042 c.tempBlured[isColor] + y*dstStride + x,
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3043 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3044 c.ppMode.maxTmpNoise);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3045 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3046
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
3047 /* did we use a tmp buffer for the last lines*/
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
3048 if(y+15 >= height)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3049 {
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3050 uint8_t *dstBlock= &(dst[y*dstStride]);
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3051 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3052 }
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3053 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3054 for(x=0; x<width; x+=32)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3055 {
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3056 volatile int i;
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3057 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3058 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3059 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3060 // + dstBlock[x +13*dstStride]
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3061 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3062 }*/
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3063 }
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3064 #ifdef HAVE_3DNOW
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3065 asm volatile("femms");
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3066 #elif defined (HAVE_MMX)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3067 asm volatile("emms");
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3068 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3069
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3070 #ifdef DEBUG_BRIGHTNESS
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3071 if(!isColor)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3072 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3073 int max=1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3074 int i;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3075 for(i=0; i<256; i++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3076 if(yHistogram[i] > max) max=yHistogram[i];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3077
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3078 for(i=1; i<256; i++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3079 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3080 int x;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3081 int start=yHistogram[i-1]/(max/256+1);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3082 int end=yHistogram[i]/(max/256+1);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3083 int inc= end > start ? 1 : -1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3084 for(x=start; x!=end+inc; x+=inc)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3085 dst[ i*dstStride + x]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3086 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3087
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3088 for(i=0; i<100; i+=2)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3089 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3090 dst[ (white)*dstStride + i]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3091 dst[ (black)*dstStride + i]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3092 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3093
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3094 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3095 #endif
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3096
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3097 *c2= c; //copy local context back
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3098
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3099 }