annotate postproc/postprocess_template.c @ 7946:f483ab704252

postprocessing cleanup: remove opendivx #ifdefs remove rk1 filter remove unused / obsolete stuff add -1,4,2,4,-1 deinterlacing filter (ffmpeg uses that) threadsafe / no more non-const globals some optimizations different strides for Y,U,V possible remove ebx usage (someone really should fix gcc, this is really lame) change the dering filter slightly (tell me if its worse for any files)
author michael
date Mon, 28 Oct 2002 19:31:04 +0000
parents e3ecccc7e505
children 5a6cbe774760
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1 /*
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
4 This program is free software; you can redistribute it and/or modify
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
5 it under the terms of the GNU General Public License as published by
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
6 the Free Software Foundation; either version 2 of the License, or
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
7 (at your option) any later version.
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
8
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
9 This program is distributed in the hope that it will be useful,
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
12 GNU General Public License for more details.
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
13
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
14 You should have received a copy of the GNU General Public License
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
15 along with this program; if not, write to the Free Software
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
17 */
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
18
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
19 #undef PAVGB
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
20 #undef PMINUB
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
21 #undef PMAXUB
2189
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
22
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
23 #ifdef HAVE_MMX2
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
24 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
25 #elif defined (HAVE_3DNOW)
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
26 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
82556b3a1228 Cleanup:
arpi
parents: 2185
diff changeset
27 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
28
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
29 #ifdef HAVE_MMX2
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
30 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
31 #elif defined (HAVE_MMX)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
32 #define PMINUB(b,a,t) \
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
33 "movq " #a ", " #t " \n\t"\
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
34 "psubusb " #b ", " #t " \n\t"\
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
35 "psubb " #t ", " #a " \n\t"
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
36 #endif
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
37
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
38 #ifdef HAVE_MMX2
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
39 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
40 #elif defined (HAVE_MMX)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
41 #define PMAXUB(a,b) \
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
42 "psubusb " #a ", " #b " \n\t"\
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
43 "paddb " #a ", " #b " \n\t"
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
44 #endif
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
45
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
46
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
47 //FIXME? |255-0| = 1 (shouldn't be a problem ...)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
48 #ifdef HAVE_MMX
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
49 /**
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
50 * Check if the middle 8x8 Block in the given 8x16 block is flat
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
51 */
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
52 static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
53 int numEq= 0;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
54 src+= stride*4; // src points to begin of the 8x8 Block
2413
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
55 asm volatile(
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
56 "leal (%1, %2), %%eax \n\t"
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
57 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
58 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 (no ecx is used here: eax is advanced by 4*%2 below and reused for lines 5-7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
59 "movq %3, %%mm7 \n\t" // mm7 = c->mmxDcOffset
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
60 "movq %4, %%mm6 \n\t" // mm6 = c->mmxDcThreshold
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
61 "movq (%1), %%mm0 \n\t"
2413
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
62 "movq (%%eax), %%mm1 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
63 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
64 "paddb %%mm7, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
65 "pcmpgtb %%mm6, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
66
2413
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
67 "movq (%%eax,%2), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
68 "psubb %%mm2, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
69 "paddb %%mm7, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
70 "pcmpgtb %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
71 "paddb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
72
2413
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
73 "movq (%%eax, %2, 2), %%mm1 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
74 "psubb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
75 "paddb %%mm7, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
76 "pcmpgtb %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
77 "paddb %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
78
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
79 "leal (%%eax, %2, 4), %%eax \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
80
2413
32e733ec8a88 optimizations (+2% speedup)
michael
parents: 2401
diff changeset
81 "movq (%1, %2, 4), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
82 "psubb %%mm2, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
83 "paddb %%mm7, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
84 "pcmpgtb %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
85 "paddb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
86
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
87 "movq (%%eax), %%mm1 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
88 "psubb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
89 "paddb %%mm7, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
90 "pcmpgtb %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
91 "paddb %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
92
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
93 "movq (%%eax, %2), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
94 "psubb %%mm2, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
95 "paddb %%mm7, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
96 "pcmpgtb %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
97 "paddb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
98
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
99 "movq (%%eax, %2, 2), %%mm1 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
100 "psubb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
101 "paddb %%mm7, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
102 "pcmpgtb %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
103 "paddb %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
104
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
105 " \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
106 #ifdef HAVE_MMX2
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
107 "pxor %%mm7, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
108 "psadbw %%mm7, %%mm0 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
109 #else
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
110 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
111 "psrlw $8, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
112 "paddb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
113 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
114 "psrlq $16, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
115 "paddb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
116 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
117 "psrlq $32, %%mm0 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
118 "paddb %%mm1, %%mm0 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
119 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
120 "movd %%mm0, %0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
121 : "=r" (numEq)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
122 : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
123 : "%eax"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
124 );
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
125 numEq= (-numEq) &0xFF;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
126 return numEq > c->ppMode.flatnessThreshold;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
127 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
128 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
129
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
130 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
131 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
132 #ifdef HAVE_MMX
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
133 int isOk;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
134 src+= stride*3;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
135 asm volatile(
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
136 "movq (%1, %2), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
137 "movq (%1, %2, 8), %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
138 "movq %%mm0, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
139 "psubusb %%mm1, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
140 "psubusb %%mm2, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
141 "por %%mm1, %%mm0 \n\t" // ABS Diff
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
142
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
143 "movq %3, %%mm7 \n\t" // QP,..., QP
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
144 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
145 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
146 "packssdw %%mm0, %%mm0 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
147 "movd %%mm0, %0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
148 : "=r" (isOk)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
149 : "r" (src), "r" (stride), "m" (c->pQPb)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
150 );
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
151 return isOk==0;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
152 #else
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
153 int x;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
154 const int QP= c->QP;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
155 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
156 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
157 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
158 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
159 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
160
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
161 return 1;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
162 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
163 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
164
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
165 /**
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
166 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
167 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
168 */
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
169 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
170 {
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
171 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
172 src+= stride*3;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
173 asm volatile( //"movv %0 %1 %2\n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
174 "movq %2, %%mm0 \n\t" // QP,..., QP
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
175 "pxor %%mm4, %%mm4 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
176
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
177 "movq (%0), %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
178 "movq (%0, %1), %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
179 "movq %%mm5, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
180 "movq %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
181 "psubusb %%mm6, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
182 "psubusb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
183 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
184 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
185 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
186
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
187 "pand %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
188 "pandn %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
189 "por %%mm2, %%mm6 \n\t"// First Line to Filter
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
190
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
191 "movq (%0, %1, 8), %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
192 "leal (%0, %1, 4), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
193 "leal (%0, %1, 8), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
194 "subl %1, %%ecx \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
195 "addl %1, %0 \n\t" // %0 points to line 1 not 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
196 "movq (%0, %1, 8), %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
197 "movq %%mm5, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
198 "movq %%mm7, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
199 "psubusb %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
200 "psubusb %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
201 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
202 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
203 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
204
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
205 "pand %%mm2, %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
206 "pandn %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
207 "por %%mm2, %%mm7 \n\t" // Last Line to Filter
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
208
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
209
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
210 // 1 2 3 4 5 6 7 8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
211 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
212 // 6 4 2 2 1 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
213 // 6 4 4 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
214 // 6 8 2
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
215
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
216 "movq (%0, %1), %%mm0 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
217 "movq %%mm0, %%mm1 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
218 PAVGB(%%mm6, %%mm0) //1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
219 PAVGB(%%mm6, %%mm0) //3 1 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
220
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
221 "movq (%0, %1, 4), %%mm2 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
222 "movq %%mm2, %%mm5 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
223 PAVGB((%%eax), %%mm2) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
224 PAVGB((%0, %1, 2), %%mm2) // 211 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
225 "movq %%mm2, %%mm3 \n\t" // 211 /4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
226 "movq (%0), %%mm4 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
227 PAVGB(%%mm4, %%mm3) // 4 211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
228 PAVGB(%%mm0, %%mm3) //642211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
229 "movq %%mm3, (%0) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
230 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
231 "movq %%mm1, %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
232 PAVGB(%%mm6, %%mm0) //1 1 /2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
233 "movq %%mm4, %%mm3 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
234 PAVGB((%0,%1,2), %%mm3) // 1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
235 PAVGB((%%eax,%1,2), %%mm5) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
236 PAVGB((%%eax), %%mm5) // 211 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
237 PAVGB(%%mm5, %%mm3) // 2 2211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
238 PAVGB(%%mm0, %%mm3) //4242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
239 "movq %%mm3, (%0,%1) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
240 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
241 PAVGB(%%mm4, %%mm6) //11 /2
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
242 "movq (%%ecx), %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
243 PAVGB((%%eax, %1, 2), %%mm0) // 11/2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
244 "movq %%mm0, %%mm3 \n\t" // 11/2
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
245 PAVGB(%%mm1, %%mm0) // 2 11/4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
246 PAVGB(%%mm6, %%mm0) //222 11/8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
247 PAVGB(%%mm2, %%mm0) //22242211/16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
248 "movq (%0, %1, 2), %%mm2 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
249 "movq %%mm0, (%0, %1, 2) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
250 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
251 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
252 PAVGB((%%ecx), %%mm0) // 11 /2
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
253 PAVGB(%%mm0, %%mm6) //11 11 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
254 PAVGB(%%mm1, %%mm4) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
255 PAVGB(%%mm2, %%mm1) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
256 PAVGB(%%mm1, %%mm6) //1122 11 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
257 PAVGB(%%mm5, %%mm6) //112242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
258 "movq (%%eax), %%mm5 \n\t" // 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
259 "movq %%mm6, (%%eax) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
260 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
261 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
262 PAVGB(%%mm7, %%mm6) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
263 PAVGB(%%mm4, %%mm6) // 11 11 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
264 PAVGB(%%mm3, %%mm6) // 11 2211 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
265 PAVGB(%%mm5, %%mm2) // 11 /2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
266 "movq (%0, %1, 4), %%mm4 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
267 PAVGB(%%mm4, %%mm2) // 112 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
268 PAVGB(%%mm2, %%mm6) // 112242211 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
269 "movq %%mm6, (%0, %1, 4) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
270 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
271 PAVGB(%%mm7, %%mm1) // 11 2 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
272 PAVGB(%%mm4, %%mm5) // 11 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
273 PAVGB(%%mm5, %%mm0) // 11 11 /4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
274 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
275 PAVGB(%%mm6, %%mm1) // 11 4 2 /8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
276 PAVGB(%%mm0, %%mm1) // 11224222 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
277 "movq %%mm1, (%%eax, %1, 2) \n\t" // X
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
278 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
279 PAVGB((%%ecx), %%mm2) // 112 4 /8
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
280 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
281 PAVGB(%%mm0, %%mm6) // 1 1 /2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
282 PAVGB(%%mm7, %%mm6) // 1 12 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
283 PAVGB(%%mm2, %%mm6) // 1122424 /4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
284 "movq %%mm6, (%%ecx) \n\t" // X
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
285 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
286 PAVGB(%%mm7, %%mm5) // 11 2 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
287 PAVGB(%%mm7, %%mm5) // 11 6 /8
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
288
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
289 PAVGB(%%mm3, %%mm0) // 112 /4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
290 PAVGB(%%mm0, %%mm5) // 112246 /16
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
291 "movq %%mm5, (%%eax, %1, 4) \n\t" // X
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
292 "subl %1, %0 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
293
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
294 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
295 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
296 : "%eax", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
297 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
298 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
299 const int l1= stride;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
300 const int l2= stride + l1;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
301 const int l3= stride + l2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
302 const int l4= stride + l3;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
303 const int l5= stride + l4;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
304 const int l6= stride + l5;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
305 const int l7= stride + l6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
306 const int l8= stride + l7;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
307 const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
308 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
309 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
310 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
311 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
312 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
313 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
314
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
315 int sums[9];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
316 sums[0] = first + src[l1];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
317 sums[1] = src[l1] + src[l2];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
318 sums[2] = src[l2] + src[l3];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
319 sums[3] = src[l3] + src[l4];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
320 sums[4] = src[l4] + src[l5];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
321 sums[5] = src[l5] + src[l6];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
322 sums[6] = src[l6] + src[l7];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
323 sums[7] = src[l7] + src[l8];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
324 sums[8] = src[l8] + last;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
325
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
326 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
327 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
328 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
329 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
330 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
331 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
332 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
333 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
334
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
335 src++;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
336 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
337 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
338 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
339
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
340 #if 0
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
341 /**
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
342 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
343 * values are correctly clipped (MMX2)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
344 * values wrap around (C)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
345 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
346 0 8 16 24
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
347 x = 8
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
348 x/2 = 4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
349 x/8 = 1
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
350 1 12 12 23
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
351 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
352 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) // RK1 deblocking of the line 4 / line 5 edge (lines counted from src + 3*stride); compiled out by the surrounding #if 0
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
353 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
354 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
355 src+= stride*3; // all line numbers below are relative to this row
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
356 // FIXME rounding
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
357 asm volatile( // processes 8 bytes (one movq) per line; the QP argument is not read on this path, the threshold comes from pQPb
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
358 "pxor %%mm7, %%mm7 \n\t" // 0
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
359 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
360 "leal (%0, %1), %%eax \n\t" // eax = src + stride (line 1)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
361 "leal (%%eax, %1, 4), %%ecx \n\t" // ecx = src + 5*stride (line 5)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
362 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
363 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
364 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
365 "movq %%mm0, %%mm1 \n\t" // QP,..., QP
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
366 "paddusb "MANGLE(b02)", %%mm0 \n\t" // saturating add of b02; presumably 0x02 in every byte, i.e. QP + 2 -- TODO confirm
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
367 "psrlw $2, %%mm0 \n\t" // shifts 16-bit words, so bits cross from the high byte into the low byte
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
368 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
369 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
370 "movq (%0, %1, 4), %%mm2 \n\t" // line 4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
371 "movq (%%ecx), %%mm3 \n\t" // line 5
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
372 "movq %%mm2, %%mm4 \n\t" // line 4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
373 "pcmpeqb %%mm5, %%mm5 \n\t" // -1
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
374 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
375 PAVGB(%%mm3, %%mm5) // PAVGB is a macro defined elsewhere; presumably byte-wise average of l5 and (-l4 - 1)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
376 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
377 "psubusb %%mm3, %%mm4 \n\t" // MAX(0, l4 - l5)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
378 "psubusb %%mm2, %%mm3 \n\t" // MAX(0, l5 - l4)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
379 "por %%mm3, %%mm4 \n\t" // |l4 - l5|
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
380 "psubusb %%mm0, %%mm4 \n\t" // MAX(0, |l4 - l5| - QP*1.25)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
381 "pcmpeqb %%mm7, %%mm4 \n\t" // |l4 - l5| <= QP*1.25 ? -1 : 0 (NOTE(review): the C path below tests with < and omits the +2)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
382 "pand %%mm4, %%mm5 \n\t" // d/2
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
383
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
384 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
385 "paddb %%mm5, %%mm2 \n\t" // line 4 + d/2 (wrapping add)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
386 // "psubb %%mm6, %%mm2 \n\t"
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
387 "movq %%mm2, (%0,%1, 4) \n\t" // store line 4
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
388
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
389 "movq (%%ecx), %%mm2 \n\t" // line 5
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
390 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
391 "psubb %%mm5, %%mm2 \n\t" // line 5 - d/2 (wrapping subtract)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
392 // "psubb %%mm6, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
393 "movq %%mm2, (%%ecx) \n\t" // store line 5
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
394
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
395 "paddb %%mm6, %%mm5 \n\t" // d/2 + 0x80: bias to unsigned before the logical shift
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
396 "psrlw $2, %%mm5 \n\t" // word shift right by 2; stray bits are masked on the next line
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
397 "pand "MANGLE(b3F)", %%mm5 \n\t" // presumably keeps the low 6 bits of every byte -- TODO confirm b3F
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
398 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
399
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
400 "movq (%%eax, %1, 2), %%mm2 \n\t" // line 3
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
401 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
402 "paddsb %%mm5, %%mm2 \n\t" // + (l5-l4)/8, signed saturating
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
403 "psubb %%mm6, %%mm2 \n\t" // undo the 0x80 bias
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
404 "movq %%mm2, (%%eax, %1, 2) \n\t" // store line 3
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
405
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
406 "movq (%%ecx, %1), %%mm2 \n\t" // line 6
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
407 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
408 "psubsb %%mm5, %%mm2 \n\t" // - (l5-l4)/8, signed saturating
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
409 "psubb %%mm6, %%mm2 \n\t" // undo the 0x80 bias
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
410 "movq %%mm2, (%%ecx, %1) \n\t" // store line 6
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
411
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
412 :
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
413 : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
414 : "%eax", "%ecx" // scratch pointers set up by the two leal above; NOTE(review): the asm stores through src but declares no memory clobber
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
415 );
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
416 #else
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
417 const int l1= stride; // lN = byte offset of line N from src
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
418 const int l2= stride + l1;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
419 const int l3= stride + l2;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
420 const int l4= stride + l3;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
421 const int l5= stride + l4;
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
422 const int l6= stride + l5;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
423 // const int l7= stride + l6;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
424 // const int l8= stride + l7;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
425 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
426 int x;
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
427 const int QP15= QP + (QP>>2); // QP*1.25 in integer arithmetic: the largest step that is still filtered
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
428 src+= stride*3; // l1..l6 are applied relative to this row
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
429 for(x=0; x<BLOCK_SIZE; x++) // one column per iteration
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
430 {
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
431 const int v = (src[x+l5] - src[x+l4]); // signed step across the line 4 / line 5 edge
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
432 if(ABS(v) < QP15) // leave larger steps untouched
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
433 {
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
434 src[x+l3] +=v>>3; // results are stored back into uint8_t without clamping
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
435 src[x+l4] +=v>>1; // line 4 moves half the step towards line 5
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
436 src[x+l5] -=v>>1; // line 5 moves half the step towards line 4
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
437 src[x+l6] -=v>>3; // outer lines 3 and 6 get one eighth of the step
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
438
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
439 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
440 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
441
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
442 #endif
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
443 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
444 #endif
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
445
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
446 /**
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
447 * Experimental Filter 1
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
448 * will not damage linear gradients
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
449 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
450 * can only smooth blocks at the expected locations (it can't smooth them if they moved)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
451 * MMX2 version does correct clipping, C version doesn't
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
452 */
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
453 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
454 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
455 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
456 src+= stride*3;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
457
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
458 asm volatile(
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
459 "pxor %%mm7, %%mm7 \n\t" // 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
460 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
461 "leal (%%eax, %1, 4), %%ecx \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
462 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
463 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
464 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
465 "movq (%0, %1, 4), %%mm1 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
466 "movq %%mm1, %%mm2 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
467 "psubusb %%mm0, %%mm1 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
468 "psubusb %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
469 "por %%mm1, %%mm0 \n\t" // |l2 - l3|
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
470 "movq (%%ecx), %%mm3 \n\t" // line 5
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
471 "movq (%%ecx, %1), %%mm4 \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
472 "movq %%mm3, %%mm5 \n\t" // line 5
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
473 "psubusb %%mm4, %%mm3 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
474 "psubusb %%mm5, %%mm4 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
475 "por %%mm4, %%mm3 \n\t" // |l5 - l6|
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
476 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
477 "movq %%mm2, %%mm1 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
478 "psubusb %%mm5, %%mm2 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
479 "movq %%mm2, %%mm4 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
480 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
481 "psubusb %%mm1, %%mm5 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
482 "por %%mm5, %%mm4 \n\t" // |l4 - l5|
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
483 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
484 "movq %%mm4, %%mm3 \n\t" // d
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
485 "movq %2, %%mm0 \n\t"
5787
5c36f7890b53 x1 deblocking filter bugfix
michael
parents: 4403
diff changeset
486 "paddusb %%mm0, %%mm0 \n\t"
5c36f7890b53 x1 deblocking filter bugfix
michael
parents: 4403
diff changeset
487 "psubusb %%mm0, %%mm4 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
488 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
489 "psubusb "MANGLE(b01)", %%mm3 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
490 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
491
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
492 PAVGB(%%mm7, %%mm3) // d/2
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
493 "movq %%mm3, %%mm1 \n\t" // d/2
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
494 PAVGB(%%mm7, %%mm3) // d/4
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
495 PAVGB(%%mm1, %%mm3) // 3*d/8
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
496
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
497 "movq (%0, %1, 4), %%mm0 \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
498 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
499 "psubusb %%mm3, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
500 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
501 "movq %%mm0, (%0, %1, 4) \n\t" // line 4
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
502
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
503 "movq (%%ecx), %%mm0 \n\t" // line 5
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
504 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
505 "paddusb %%mm3, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
506 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
507 "movq %%mm0, (%%ecx) \n\t" // line 5
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
508
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
509 PAVGB(%%mm7, %%mm1) // d/4
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
510
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
512 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
513 "psubusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
514 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
515 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
516
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
517 "movq (%%ecx, %1), %%mm0 \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
519 "paddusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
520 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
521 "movq %%mm0, (%%ecx, %1) \n\t" // line 6
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
522
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
523 PAVGB(%%mm7, %%mm1) // d/8
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
524
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
525 "movq (%%eax, %1), %%mm0 \n\t" // line 2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
526 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
527 "psubusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
528 "pxor %%mm2, %%mm0 \n\t"
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
529 "movq %%mm0, (%%eax, %1) \n\t" // line 2
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
530
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
531 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
533 "paddusb %%mm1, %%mm0 \n\t"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
534 "pxor %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
535 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
536
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
537 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
538 : "r" (src), "r" (stride), "m" (co->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
539 : "%eax", "%ecx"
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
540 );
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
541 #else
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
542
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
543 const int l1= stride;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
544 const int l2= stride + l1;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
545 const int l3= stride + l2;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
546 const int l4= stride + l3;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
547 const int l5= stride + l4;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
548 const int l6= stride + l5;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
549 const int l7= stride + l6;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
550 // const int l8= stride + l7;
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
551 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
552 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
553
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
554 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
555 for(x=0; x<BLOCK_SIZE; x++)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
556 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
557 int a= src[l3] - src[l4];
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
558 int b= src[l4] - src[l5];
2179
2d8d14b882cc fixed a rounding bug thing in the X1 Filter
michael
parents: 2169
diff changeset
559 int c= src[l5] - src[l6];
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
560
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
561 int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
562 d= MAX(d, 0);
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
563
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
564 if(d < co->QP*2)
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
565 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
566 int v = d * SIGN(-b);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
567
2586
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
568 src[l2] +=v>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
569 src[l3] +=v>>2;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
570 src[l4] +=(3*v)>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
571 src[l5] -=(3*v)>>3;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
572 src[l6] -=v>>2;
3b05a6b4d870 c speedup (x1, rk1 filters)
michael
parents: 2570
diff changeset
573 src[l7] -=v>>3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
574
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
575 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
576 src++;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
577 }
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
578 #endif
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
579 }
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
580
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
581 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
582 {
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
584 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
585 uint8_t tmp[16];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
586 const int l1= stride;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
587 const int l2= stride + l1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
588 const int l3= stride + l2;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
589 const int l4= (int)tmp - (int)src - stride*3;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
590 const int l5= (int)tmp - (int)src - stride*3 + 8;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
591 const int l6= stride*3 + l3;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
592 const int l7= stride + l6;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
593 const int l8= stride + l7;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
594
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
595 memcpy(tmp, src+stride*7, 8);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
596 memcpy(tmp+8, src+stride*8, 8);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
597 */
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
598 src+= stride*4;
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
599 asm volatile(
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
600
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
601 #if 0 //slightly more accurate and slightly slower
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
602 "pxor %%mm7, %%mm7 \n\t" // 0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
603 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
604 "leal (%%eax, %1, 4), %%ecx \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
605 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
606 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
607 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
608
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
609
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
610 "movq (%0, %1, 2), %%mm0 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
611 "movq (%0), %%mm1 \n\t" // l0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
612 "movq %%mm0, %%mm2 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
613 PAVGB(%%mm7, %%mm0) // ~l2/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
614 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
615 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
616
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
617 "movq (%%eax), %%mm1 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
618 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
619 "movq %%mm1, %%mm4 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
620 PAVGB(%%mm7, %%mm1) // ~l1/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
621 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
622 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
623
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
624 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
625 "psubusb %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
626 "psubusb %%mm4, %%mm1 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
627 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
628 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
629
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
630 "movq (%0, %1, 4), %%mm0 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
631 "movq %%mm0, %%mm4 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
632 PAVGB(%%mm7, %%mm0) // ~l4/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
633 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
634 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
635
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
636 "movq (%%ecx), %%mm2 \n\t" // l5
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
637 "movq %%mm3, %%mm5 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
638 PAVGB(%%mm7, %%mm3) // ~l3/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
639 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
640 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
641
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
642 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
643 "psubusb %%mm3, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
644 "psubusb %%mm6, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
645 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
646 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
647 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
648
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
649 "movq (%%ecx, %1), %%mm6 \n\t" // l6
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
650 "movq %%mm6, %%mm5 \n\t" // l6
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
651 PAVGB(%%mm7, %%mm6) // ~l6/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
652 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
653 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
654
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
655 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
656 "movq %%mm2, %%mm4 \n\t" // l5
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
657 PAVGB(%%mm7, %%mm2) // ~l5/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
658 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
659 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
660
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
661 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
662 "psubusb %%mm2, %%mm6 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
663 "psubusb %%mm4, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
664 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
665 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
666
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
667
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
668 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
669 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
670 "paddusb "MANGLE(b01)", %%mm4 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
671 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
672 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
673 "pand %%mm4, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
674
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
675 "movq %%mm3, %%mm1 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
676 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
677 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
678 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
679 "paddusb %%mm1, %%mm3 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
680 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
681
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
682 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
683 "movq (%0, %1, 4), %%mm5 \n\t" //l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
684 "movq (%0, %1, 4), %%mm4 \n\t" //l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
685 "psubusb %%mm6, %%mm5 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
686 "psubusb %%mm4, %%mm6 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
687 "por %%mm6, %%mm5 \n\t" // |l3-l4|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
688 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
689 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
690 "pand %%mm0, %%mm3 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
691 PMINUB(%%mm5, %%mm3, %%mm0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
692
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
693 "psubusb "MANGLE(b01)", %%mm3 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
694 PAVGB(%%mm7, %%mm3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
695
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
696 "movq (%%eax, %1, 2), %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
697 "movq (%0, %1, 4), %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
698 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
699 "pxor %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
700 "psubb %%mm3, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
701 "paddb %%mm3, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
702 "pxor %%mm6, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
703 "pxor %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
704 "movq %%mm0, (%%eax, %1, 2) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
705 "movq %%mm2, (%0, %1, 4) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
706 #endif
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
707
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
708 "leal (%0, %1), %%eax \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
709 "pcmpeqb %%mm6, %%mm6 \n\t" // -1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
710 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
711 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
712 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
713
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
714
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
715 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
716 "movq (%0, %1, 4), %%mm0 \n\t" // l4
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
717 "pxor %%mm6, %%mm1 \n\t" // -l3-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
718 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
719 // mm1=-l3-1, mm0=128-q
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
720
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
721 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
722 "movq (%%eax, %1), %%mm3 \n\t" // l2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
723 "pxor %%mm6, %%mm2 \n\t" // -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
724 "movq %%mm2, %%mm5 \n\t" // -l5-1
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
725 "movq "MANGLE(b80)", %%mm4 \n\t" // 128
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
726 "leal (%%eax, %1, 4), %%ecx \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
727 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
728 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
729 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
730 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
731 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
732
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
733 "movq (%%eax), %%mm2 \n\t" // l1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
734 "pxor %%mm6, %%mm2 \n\t" // -l1-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
735 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
736 PAVGB((%0), %%mm1) // (l0-l3+256)/2
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
737 "movq "MANGLE(b80)", %%mm3 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
738 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
739 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
740 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
741 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
742
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
743 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
744 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
745 "pxor %%mm6, %%mm1 \n\t" // -l7-1
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
746 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
747 "movq "MANGLE(b80)", %%mm2 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
748 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
749 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
750 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
751 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
752
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
753 "movq "MANGLE(b00)", %%mm1 \n\t" // 0
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
754 "movq "MANGLE(b00)", %%mm5 \n\t" // 0
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
755 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
756 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
757 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
758 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
759 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
760
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
761 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
762
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
763 "movq "MANGLE(b00)", %%mm7 \n\t" // 0
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
764 "movq %2, %%mm2 \n\t" // QP
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
765 PAVGB(%%mm6, %%mm2) // 128 + QP/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
766 "psubb %%mm6, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
767
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
768 "movq %%mm4, %%mm1 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
769 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
770 "pxor %%mm1, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
771 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
772 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
773 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
774 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
775
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
776 "movq %%mm4, %%mm3 \n\t" // d
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
777 "psubusb "MANGLE(b01)", %%mm4 \n\t"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
778 PAVGB(%%mm7, %%mm4) // d/32
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
779 PAVGB(%%mm7, %%mm4) // (d + 32)/64
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
780 "paddb %%mm3, %%mm4 \n\t" // 5d/64
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
781 "pand %%mm2, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
782
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
783 "movq "MANGLE(b80)", %%mm5 \n\t" // 128
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
784 "psubb %%mm0, %%mm5 \n\t" // q
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
785 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
786 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
787 "pxor %%mm7, %%mm5 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
788
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
789 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
790 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
791
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
792 "pand %%mm7, %%mm4 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
793 "movq (%%eax, %1, 2), %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
794 "movq (%0, %1, 4), %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
795 "pxor %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
796 "pxor %%mm1, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
797 "paddb %%mm4, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
798 "psubb %%mm4, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
799 "pxor %%mm1, %%mm0 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
800 "pxor %%mm1, %%mm2 \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
801 "movq %%mm0, (%%eax, %1, 2) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
802 "movq %%mm2, (%0, %1, 4) \n\t"
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
803
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
804 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
805 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
806 : "%eax", "%ecx"
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
807 );
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
808
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
809 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
810 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
811 int x;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
812 src-= stride;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
813 for(x=0; x<BLOCK_SIZE; x++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
814 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
815 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
816 if(ABS(middleEnergy)< 8*QP)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
817 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
818 const int q=(src[l4] - src[l5])/2;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
819 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
820 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
821
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
822 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
823 d= MAX(d, 0);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
824
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
825 d= (5*d + 32) >> 6;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
826 d*= SIGN(-middleEnergy);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
827
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
828 if(q>0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
829 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
830 d= d<0 ? 0 : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
831 d= d>q ? q : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
832 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
833 else
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
834 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
835 d= d>0 ? 0 : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
836 d= d<q ? q : d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
837 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
838
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
839 src[l4]-= d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
840 src[l5]+= d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
841 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
842 src++;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
843 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
844 src-=8;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
845 for(x=0; x<8; x++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
846 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
847 int y;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
848 for(y=4; y<6; y++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
849 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
850 int d= src[x+y*stride] - tmp[x+(y-4)*8];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
851 int ad= ABS(d);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
852 static int max=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
853 static int sum=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
854 static int num=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
855 static int bias=0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
856
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
857 if(max<ad) max=ad;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
858 sum+= ad>3 ? 1 : 0;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
859 if(ad>3)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
860 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
861 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
862 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
863 if(y==4) bias+=d;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
864 num++;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
865 if(num%1000000 == 0)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
866 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
867 printf(" %d %d %d %d\n", num, sum, max, bias);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
868 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
869 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
870 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
871 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
872 */
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
873 #elif defined (HAVE_MMX)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
874 src+= stride*4;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
875
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
876 asm volatile(
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
877 "pxor %%mm7, %%mm7 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
878 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
879 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
880 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
881 "andl $0xFFFFFFF8, %%ecx \n\t" // align
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
882 // 0 1 2 3 4 5 6 7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
883 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
884 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
885
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
886 "movq (%0), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
887 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
888 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
889 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
890
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
891 "movq (%%eax), %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
892 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
893 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
894 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
895
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
896 "movq (%%eax, %1), %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
897 "movq %%mm4, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
898 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
899 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
900
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
901 "paddw %%mm0, %%mm0 \n\t" // 2L0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
902 "paddw %%mm1, %%mm1 \n\t" // 2H0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
903 "psubw %%mm4, %%mm2 \n\t" // L1 - L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
904 "psubw %%mm5, %%mm3 \n\t" // H1 - H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
905 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
906 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
907
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
908 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
909 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
910 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
911 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
912
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
913 "movq (%%eax, %1, 2), %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
914 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
915 "punpcklbw %%mm7, %%mm2 \n\t" // L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
916 "punpckhbw %%mm7, %%mm3 \n\t" // H3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
917
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
918 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
919 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
922 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
923 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
924
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
925 "movq (%0, %1, 4), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
926 "movq %%mm0, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
927 "punpcklbw %%mm7, %%mm0 \n\t" // L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
928 "punpckhbw %%mm7, %%mm1 \n\t" // H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
929
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
930 "psubw %%mm0, %%mm2 \n\t" // L3 - L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
931 "psubw %%mm1, %%mm3 \n\t" // H3 - H4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
932 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
933 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
934 "paddw %%mm4, %%mm4 \n\t" // 2L2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
935 "paddw %%mm5, %%mm5 \n\t" // 2H2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
936 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
937 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
938
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
939 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
940 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
941 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
942 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
943 //50 opcodes so far
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
944 "movq (%%edx), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
945 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
946 "punpcklbw %%mm7, %%mm2 \n\t" // L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
947 "punpckhbw %%mm7, %%mm3 \n\t" // H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
950 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
951 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
952
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
953 "movq (%%edx, %1), %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
954 "punpcklbw %%mm7, %%mm6 \n\t" // L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
955 "psubw %%mm6, %%mm2 \n\t" // L5 - L6
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
956 "movq (%%edx, %1), %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
957 "punpckhbw %%mm7, %%mm6 \n\t" // H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
958 "psubw %%mm6, %%mm3 \n\t" // H5 - H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
959
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
960 "paddw %%mm0, %%mm0 \n\t" // 2L4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
961 "paddw %%mm1, %%mm1 \n\t" // 2H4
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
962 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
963 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
964
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
965 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
966 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
967 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
969
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
970 "movq (%%edx, %1, 2), %%mm2 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
971 "movq %%mm2, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
972 "punpcklbw %%mm7, %%mm2 \n\t" // L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
973 "punpckhbw %%mm7, %%mm3 \n\t" // H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
974
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
975 "paddw %%mm2, %%mm2 \n\t" // 2L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
976 "paddw %%mm3, %%mm3 \n\t" // 2H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
977 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
978 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
979
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
980 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
981 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
982
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
983 #ifdef HAVE_MMX2
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
984 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
985 "psubw %%mm0, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
986 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
987 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
988 "psubw %%mm1, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
989 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
990 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
991 "psubw %%mm2, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
992 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
993 "movq %%mm7, %%mm6 \n\t" // 0
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
994 "psubw %%mm3, %%mm6 \n\t"
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
995 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
996 #else
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
997 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
998 "pcmpgtw %%mm0, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
999 "pxor %%mm6, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1000 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1001 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1002 "pcmpgtw %%mm1, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1003 "pxor %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1004 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1005 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1006 "pcmpgtw %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1007 "pxor %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1008 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1009 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1010 "pcmpgtw %%mm3, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1011 "pxor %%mm6, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1012 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1013 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1014
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1015 #ifdef HAVE_MMX2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1016 "pminsw %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1017 "pminsw %%mm3, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1018 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1019 "movq %%mm0, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1020 "psubusw %%mm2, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1021 "psubw %%mm6, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1022 "movq %%mm1, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1023 "psubusw %%mm3, %%mm6 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1024 "psubw %%mm6, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1025 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1026
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1027 "movq %%mm7, %%mm6 \n\t" // 0
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1028 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1029 "pxor %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1030 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1031 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1032 "pxor %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1033 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1034 // 100 opcodes
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1035 "movd %2, %%mm2 \n\t" // QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1036 "psllw $3, %%mm2 \n\t" // 8QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1037 "movq %%mm2, %%mm3 \n\t" // 8QP
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1038 "pcmpgtw %%mm4, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1039 "pcmpgtw %%mm5, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1040 "pand %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1041 "pand %%mm3, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1042
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1043
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1044 "psubusw %%mm0, %%mm4 \n\t" // hd
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1045 "psubusw %%mm1, %%mm5 \n\t" // ld
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1046
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1047
4253
4b39bde9f7ad fix mangling with runtime cpu detection
atmos4
parents: 4248
diff changeset
1048 "movq "MANGLE(w05)", %%mm2 \n\t" // 5
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1049 "pmullw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1050 "pmullw %%mm2, %%mm5 \n\t"
4253
4b39bde9f7ad fix mangling with runtime cpu detection
atmos4
parents: 4248
diff changeset
1051 "movq "MANGLE(w20)", %%mm2 \n\t" // 32
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1052 "paddw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1053 "paddw %%mm2, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1054 "psrlw $6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1055 "psrlw $6, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1056
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1057 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1058 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1059
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1060 "pxor %%mm2, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1061 "pxor %%mm3, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1062
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1063 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1064 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1065 "pxor %%mm2, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1066 "pxor %%mm3, %%mm1 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1067 "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1068 "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1069 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1070 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1071
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1072 "pxor %%mm6, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1073 "pxor %%mm7, %%mm3 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1074 "pand %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1075 "pand %%mm3, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1076
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1077 #ifdef HAVE_MMX2
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1078 "pminsw %%mm0, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1079 "pminsw %%mm1, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1080 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1081 "movq %%mm4, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1082 "psubusw %%mm0, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1083 "psubw %%mm2, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1084 "movq %%mm5, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1085 "psubusw %%mm1, %%mm2 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1086 "psubw %%mm2, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1087 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1088 "pxor %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1089 "pxor %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1090 "psubw %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1091 "psubw %%mm7, %%mm5 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1092 "packsswb %%mm5, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1093 "movq (%%eax, %1, 2), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1094 "paddb %%mm4, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1095 "movq %%mm0, (%%eax, %1, 2) \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1096 "movq (%0, %1, 4), %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1097 "psubb %%mm4, %%mm0 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1098 "movq %%mm0, (%0, %1, 4) \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1099
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1100 :
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1101 : "r" (src), "r" (stride), "m" (c->pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1102 : "%eax", "%edx", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1103 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1104 #else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1105 const int l1= stride;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1106 const int l2= stride + l1;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1107 const int l3= stride + l2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1108 const int l4= stride + l3;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1109 const int l5= stride + l4;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1110 const int l6= stride + l5;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1111 const int l7= stride + l6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1112 const int l8= stride + l7;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1113 // const int l9= stride + l8;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
1114 int x;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1115 src+= stride*3;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
1116 for(x=0; x<BLOCK_SIZE; x++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1117 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1118 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1119 if(ABS(middleEnergy) < 8*c->QP)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1120 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1121 const int q=(src[l4] - src[l5])/2;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1122 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1123 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1124
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1125 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1126 d= MAX(d, 0);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1127
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1128 d= (5*d + 32) >> 6;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1129 d*= SIGN(-middleEnergy);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1130
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1131 if(q>0)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1132 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1133 d= d<0 ? 0 : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1134 d= d>q ? q : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1135 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1136 else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1137 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1138 d= d>0 ? 0 : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1139 d= d<q ? q : d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1140 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1141
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1142 src[l4]-= d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1143 src[l5]+= d;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1144 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1145 src++;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1146 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1147 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1148 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1149
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1150 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1151 {
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1152 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1153 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1154 "pxor %%mm6, %%mm6 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1155 "pcmpeqb %%mm7, %%mm7 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1156 "movq %2, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1157 "punpcklbw %%mm6, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1158 "psrlw $1, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1159 "psubw %%mm7, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1160 "packuswb %%mm0, %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1161 "movq %%mm0, %3 \n\t"
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1162
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1163 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1164 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1165
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1166 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1167 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1168
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1169 #undef FIND_MIN_MAX
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1170 #ifdef HAVE_MMX2
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1171 #define FIND_MIN_MAX(addr)\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1172 "movq " #addr ", %%mm0 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1173 "pminub %%mm0, %%mm7 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1174 "pmaxub %%mm0, %%mm6 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1175 #else
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1176 #define FIND_MIN_MAX(addr)\
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1177 "movq " #addr ", %%mm0 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1178 "movq %%mm7, %%mm1 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1179 "psubusb %%mm0, %%mm6 \n\t"\
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1180 "paddb %%mm0, %%mm6 \n\t"\
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1181 "psubusb %%mm0, %%mm1 \n\t"\
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1182 "psubb %%mm1, %%mm7 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1183 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1184
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1185 FIND_MIN_MAX((%%eax))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1186 FIND_MIN_MAX((%%eax, %1))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1187 FIND_MIN_MAX((%%eax, %1, 2))
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1188 FIND_MIN_MAX((%0, %1, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1189 FIND_MIN_MAX((%%edx))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1190 FIND_MIN_MAX((%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1191 FIND_MIN_MAX((%%edx, %1, 2))
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1192 FIND_MIN_MAX((%0, %1, 8))
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1193
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1194 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1195 "psrlq $8, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1196 #ifdef HAVE_MMX2
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1198 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1199 "pminub %%mm4, %%mm7 \n\t" // min of pixels
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1200 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1201 "pminub %%mm4, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1202 #else
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1203 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1204 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1205 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1206 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1207 "psrlq $16, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1208 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1209 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1210 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1211 "movq %%mm7, %%mm4 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1212 "psrlq $32, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1213 "movq %%mm7, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1214 "psubusb %%mm4, %%mm1 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1215 "psubb %%mm1, %%mm7 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1216 #endif
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1217
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1218
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1219 "movq %%mm6, %%mm4 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1220 "psrlq $8, %%mm6 \n\t"
2475
3369845d92f4 3dnow dering
michael
parents: 2473
diff changeset
1221 #ifdef HAVE_MMX2
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1222 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1223 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1224 "pmaxub %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1225 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1226 "pmaxub %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1227 #else
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1228 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1229 "paddb %%mm4, %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1230 "movq %%mm6, %%mm4 \n\t"
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1231 "psrlq $16, %%mm6 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1232 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1233 "paddb %%mm4, %%mm6 \n\t"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1234 "movq %%mm6, %%mm4 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1235 "psrlq $32, %%mm6 \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1236 "psubusb %%mm4, %%mm6 \n\t"
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1237 "paddb %%mm4, %%mm6 \n\t"
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1238 #endif
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1239 "movq %%mm6, %%mm0 \n\t" // max
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1240 "psubb %%mm7, %%mm6 \n\t" // max - min
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1241 "movd %%mm6, %%ecx \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1242 "cmpb "MANGLE(deringThreshold)", %%cl \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1243 " jb 1f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1244 "leal -24(%%esp), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1245 "andl $0xFFFFFFF8, %%ecx \n\t"
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1246 PAVGB(%%mm0, %%mm7) // a=(max + min)/2
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1247 "punpcklbw %%mm7, %%mm7 \n\t"
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1248 "punpcklbw %%mm7, %%mm7 \n\t"
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1249 "punpcklbw %%mm7, %%mm7 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1250 "movq %%mm7, (%%ecx) \n\t"
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1251
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1252 "movq (%0), %%mm0 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1253 "movq %%mm0, %%mm1 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1254 "movq %%mm0, %%mm2 \n\t" // L10
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1255 "psllq $8, %%mm1 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1256 "psrlq $8, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1257 "movd -4(%0), %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1258 "movd 8(%0), %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1259 "psrlq $24, %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1260 "psllq $56, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1261 "por %%mm3, %%mm1 \n\t" // L00
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1262 "por %%mm4, %%mm2 \n\t" // L20
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1263 "movq %%mm1, %%mm3 \n\t" // L00
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1264 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1265 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1266 "psubusb %%mm7, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1267 "psubusb %%mm7, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1268 "psubusb %%mm7, %%mm3 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1269 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1270 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1271 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1272 "paddb %%mm2, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1273 "paddb %%mm3, %%mm0 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1274
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1275 "movq (%%eax), %%mm2 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1276 "movq %%mm2, %%mm3 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1277 "movq %%mm2, %%mm4 \n\t" // L11
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1278 "psllq $8, %%mm3 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1279 "psrlq $8, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1280 "movd -4(%%eax), %%mm5 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1281 "movd 8(%%eax), %%mm6 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1282 "psrlq $24, %%mm5 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1283 "psllq $56, %%mm6 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1284 "por %%mm5, %%mm3 \n\t" // L01
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1285 "por %%mm6, %%mm4 \n\t" // L21
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1286 "movq %%mm3, %%mm5 \n\t" // L01
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1287 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1288 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1289 "psubusb %%mm7, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1290 "psubusb %%mm7, %%mm4 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1291 "psubusb %%mm7, %%mm5 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1292 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1293 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1294 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1295 "paddb %%mm4, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1296 "paddb %%mm5, %%mm2 \n\t"
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1297 // 0, 2, 3, 1
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1298 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1299 "movq " #src ", " #sx " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1300 "movq " #sx ", " #lx " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1301 "movq " #sx ", " #t0 " \n\t" /* src[0] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1302 "psllq $8, " #lx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1303 "psrlq $8, " #t0 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1304 "movd -4" #src ", " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1305 "psrlq $24, " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1306 "por " #t1 ", " #lx " \n\t" /* src[-1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1307 "movd 8" #src ", " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1308 "psllq $56, " #t1 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1309 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1310 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1311 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1312 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
2478
42d5846eeb51 faster dering
michael
parents: 2477
diff changeset
1313 PAVGB(lx, pplx) \
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1314 "movq " #lx ", 8(%%ecx) \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1315 "movq (%%ecx), " #lx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1316 "psubusb " #lx ", " #t1 " \n\t"\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1317 "psubusb " #lx ", " #t0 " \n\t"\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1318 "psubusb " #lx ", " #sx " \n\t"\
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1319 "movq "MANGLE(b00)", " #lx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1320 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1321 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1322 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1323 "paddb " #t1 ", " #t0 " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1324 "paddb " #t0 ", " #sx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1325 \
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1326 PAVGB(plx, pplx) /* filtered */\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1327 "movq " #dst ", " #t0 " \n\t" /* dst */\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1328 "movq " #t0 ", " #t1 " \n\t" /* dst */\
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1329 "psubusb %3, " #t0 " \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1330 "paddusb %3, " #t1 " \n\t"\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1331 PMAXUB(t0, pplx)\
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1332 PMINUB(t1, pplx, t0)\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1333 "paddb " #sx ", " #ppsx " \n\t"\
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1334 "paddb " #psx ", " #ppsx " \n\t"\
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1335 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
1336 "pand "MANGLE(b08)", " #ppsx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1337 "pcmpeqb " #lx ", " #ppsx " \n\t"\
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1338 "pand " #ppsx ", " #pplx " \n\t"\
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1339 "pandn " #dst ", " #ppsx " \n\t"\
2570
af43a83122fc minor speedup
michael
parents: 2478
diff changeset
1340 "por " #pplx ", " #ppsx " \n\t"\
2478
42d5846eeb51 faster dering
michael
parents: 2477
diff changeset
1341 "movq " #ppsx ", " #dst " \n\t"\
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1342 "movq 8(%%ecx), " #lx " \n\t"
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1343
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1344 /*
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1345 0000000
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1346 1111111
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1347
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1348 1111110
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1349 1111101
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1350 1111100
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1351 1111011
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1352 1111010
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1353 1111001
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1354
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1355 1111000
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1356 1110111
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1357
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1358 */
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1359 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1360 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1361 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
1362 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1363 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1364 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1365 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1366 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1367 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1368
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1369 "1: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1370 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1371 : "%eax", "%edx", "%ecx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1372 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1373 #else
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1374 int y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1375 int min=255;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1376 int max=0;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1377 int avg;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1378 uint8_t *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1379 int s[10];
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1380 const int QP2= c->QP/2 + 1;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1381
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1382 for(y=1; y<9; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1383 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1384 int x;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1385 p= src + stride*y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1386 for(x=1; x<9; x++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1387 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1388 p++;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1389 if(*p > max) max= *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1390 if(*p < min) min= *p;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1391 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1392 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1393 avg= (min + max + 1)>>1;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1394
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1395 if(max - min <deringThreshold) return;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1396
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1397 for(y=0; y<10; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1398 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1399 int t = 0;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1400
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1401 if(src[stride*y + 0] > avg) t+= 1;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1402 if(src[stride*y + 1] > avg) t+= 2;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1403 if(src[stride*y + 2] > avg) t+= 4;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1404 if(src[stride*y + 3] > avg) t+= 8;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1405 if(src[stride*y + 4] > avg) t+= 16;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1406 if(src[stride*y + 5] > avg) t+= 32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1407 if(src[stride*y + 6] > avg) t+= 64;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1408 if(src[stride*y + 7] > avg) t+= 128;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1409 if(src[stride*y + 8] > avg) t+= 256;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1410 if(src[stride*y + 9] > avg) t+= 512;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1411
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1412 t |= (~t)<<16;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1413 t &= (t<<1) & (t>>1);
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1414 s[y] = t;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1415 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1416
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1417 for(y=1; y<9; y++)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1418 {
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1419 int t = s[y-1] & s[y] & s[y+1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1420 t|= t>>16;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1421 s[y-1]= t;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1422 }
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1423
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1424 for(y=1; y<9; y++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1425 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1426 int x;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1427 int t = s[y-1];
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1428
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1429 p= src + stride*y;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1430 for(x=1; x<9; x++)
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1431 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1432 p++;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1433 if(t & (1<<x))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1434 {
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1435 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1436 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1437 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1438 f= (f + 8)>>4;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1439
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1440 #ifdef DEBUG_DERING_THRESHOLD
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1441 asm volatile("emms\n\t":);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1442 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1443 static long long numPixels=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1444 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1445 // if((max-min)<20 || (max-min)*QP<200)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1446 // if((max-min)*QP < 500)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1447 // if(max-min<QP/2)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1448 if(max-min < 20)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1449 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1450 static int numSkiped=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1451 static int errorSum=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1452 static int worstQP=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1453 static int worstRange=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1454 static int worstDiff=0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1455 int diff= (f - *p);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1456 int absDiff= ABS(diff);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1457 int error= diff*diff;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1458
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1459 if(x==1 || x==8 || y==1 || y==8) continue;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1460
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1461 numSkiped++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1462 if(absDiff > worstDiff)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1463 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1464 worstDiff= absDiff;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1465 worstQP= QP;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1466 worstRange= max-min;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1467 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1468 errorSum+= error;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1469
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1470 if(1024LL*1024LL*1024LL % numSkiped == 0)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1471 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1472 printf( "sum:%1.3f, skip:%d, wQP:%d, "
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1473 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1474 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1475 worstDiff, (float)numSkiped/numPixels);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1476 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1477 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1478 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1479 #endif
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1480 if (*p + QP2 < f) *p= *p + QP2;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1481 else if(*p - QP2 > f) *p= *p - QP2;
2477
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1482 else *p=f;
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1483 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1484 }
830c0e171d7d dering in c
michael
parents: 2476
diff changeset
1485 }
3093
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1486 #ifdef DEBUG_DERING_THRESHOLD
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1487 if(max-min < 20)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1488 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1489 for(y=1; y<9; y++)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1490 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1491 int x;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1492 int t = 0;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1493 p= src + stride*y;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1494 for(x=1; x<9; x++)
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1495 {
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1496 p++;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1497 *p = MIN(*p + 20, 255);
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1498 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1499 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1500 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1501 }
fb4cee33d3c6 faster dering
michael
parents: 3037
diff changeset
1502 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1503 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1504 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
1505
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1506 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1507 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1508 * will be called for every 8x8 block and can read & write from line 4-15
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1509 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1510 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1511 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1512 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1513 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1514 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1515 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1516 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1517 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1518 "leal (%%eax, %1, 4), %%ecx \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1519 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1520 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1521
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1522 "movq (%0), %%mm0 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1523 "movq (%%eax, %1), %%mm1 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1524 PAVGB(%%mm1, %%mm0)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1525 "movq %%mm0, (%%eax) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1526 "movq (%0, %1, 4), %%mm0 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1527 PAVGB(%%mm0, %%mm1)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1528 "movq %%mm1, (%%eax, %1, 2) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1529 "movq (%%ecx, %1), %%mm1 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1530 PAVGB(%%mm1, %%mm0)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1531 "movq %%mm0, (%%ecx) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1532 "movq (%0, %1, 8), %%mm0 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1533 PAVGB(%%mm0, %%mm1)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1534 "movq %%mm1, (%%ecx, %1, 2) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1535
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1536 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1537 : "%eax", "%ecx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1538 );
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1539 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1540 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1541 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1542 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1543 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1544 src[stride] = (src[0] + src[stride*2])>>1;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1545 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1546 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1547 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1548 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1549 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1550 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1551 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1552
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1553 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1554 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1555 * will be called for every 8x8 block and can read & write from line 4-15
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1556 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1557 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1558 * this filter will read lines 3-15 and write 7-13
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1559 * no clipping in C version
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1560 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1561 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1562 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1563 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1564 src+= stride*3;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1565 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1566 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1567 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1568 "leal (%%edx, %1, 4), %%ecx \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1569 "addl %1, %%ecx \n\t"
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1570 "pxor %%mm7, %%mm7 \n\t"
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1571 // 0 1 2 3 4 5 6 7 8 9 10
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1572 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1573
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1574 #define DEINT_CUBIC(a,b,c,d,e)\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1575 "movq " #a ", %%mm0 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1576 "movq " #b ", %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1577 "movq " #d ", %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1578 "movq " #e ", %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1579 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1580 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1581 "movq %%mm0, %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1582 "punpcklbw %%mm7, %%mm0 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1583 "punpckhbw %%mm7, %%mm2 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1584 "movq %%mm1, %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1585 "punpcklbw %%mm7, %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1586 "punpckhbw %%mm7, %%mm3 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1587 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1588 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1589 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1590 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1591 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1592 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1593 "packuswb %%mm3, %%mm1 \n\t"\
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1594 "movq %%mm1, " #c " \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1595
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1596 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1597 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1598 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1599 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1600
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1601 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1602 : "%eax", "%edx", "ecx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1603 );
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1604 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1605 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1606 src+= stride*3;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1607 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1608 {
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1609 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1610 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1611 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
1612 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1613 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1614 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1615 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1616 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1617
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1618 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1619 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1620 * will be called for every 8x8 block and can read & write from line 4-15
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1621 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1622 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1623 * this filter will read lines 4-13 and write 5-11
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1624 * no cliping in C version
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1625 */
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1626 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1627 {
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1628 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1629 src+= stride*4;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1630 asm volatile(
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1631 "leal (%0, %1), %%eax \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1632 "leal (%%eax, %1, 4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1633 "pxor %%mm7, %%mm7 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1634 "movq (%2), %%mm0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1635 // 0 1 2 3 4 5 6 7 8 9 10
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1636 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1637
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1638 #define DEINT_FF(a,b,c,d)\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1639 "movq " #a ", %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1640 "movq " #b ", %%mm2 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1641 "movq " #c ", %%mm3 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1642 "movq " #d ", %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1643 PAVGB(%%mm3, %%mm1) \
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1644 PAVGB(%%mm4, %%mm0) \
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1645 "movq %%mm0, %%mm3 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1646 "punpcklbw %%mm7, %%mm0 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1647 "punpckhbw %%mm7, %%mm3 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1648 "movq %%mm1, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1649 "punpcklbw %%mm7, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1650 "punpckhbw %%mm7, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1651 "psllw $2, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1652 "psllw $2, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1653 "psubw %%mm0, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1654 "psubw %%mm3, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1655 "movq %%mm2, %%mm5 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1656 "movq %%mm2, %%mm0 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1657 "punpcklbw %%mm7, %%mm2 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1658 "punpckhbw %%mm7, %%mm5 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1659 "paddw %%mm2, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1660 "paddw %%mm5, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1661 "psraw $2, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1662 "psraw $2, %%mm4 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1663 "packuswb %%mm4, %%mm1 \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1664 "movq %%mm1, " #b " \n\t"\
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1665
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1666 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1667 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) )
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1668 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1669 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1670
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1671 "movq %%mm0, (%2) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1672 : : "r" (src), "r" (stride), "r"(tmp)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1673 : "%eax", "%edx"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1674 );
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1675 #else
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1676 int x;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1677 src+= stride*4;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1678 for(x=0; x<8; x++)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1679 {
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1680 int t1= tmp[x];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1681 int t2= src[stride*1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1682
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1683 src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1684 t1= src[stride*4];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1685 src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1686 t2= src[stride*6];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1687 src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1688 t1= src[stride*8];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1689 src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1690 tmp[x]= t1;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1691
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1692 src++;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1693 }
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1694 #endif
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1695 }
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1696
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1697 /**
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1698 * Deinterlaces the given block
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1699 * will be called for every 8x8 block and can read & write from line 4-15
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1700 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1701 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1702 * will shift the image up by 1 line (FIXME if this is a problem)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1703 * this filter will read lines 4-13 and write 4-11
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1704 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1705 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1706 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1708 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1709 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1710 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1711 "leal (%%eax, %1, 4), %%edx \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1712 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1713 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1714
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1715 "movq (%0), %%mm0 \n\t" // L0
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1716 "movq (%%eax, %1), %%mm1 \n\t" // L2
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1717 PAVGB(%%mm1, %%mm0) // L0+L2
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1718 "movq (%%eax), %%mm2 \n\t" // L1
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1719 PAVGB(%%mm2, %%mm0)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1720 "movq %%mm0, (%0) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1721 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1722 PAVGB(%%mm0, %%mm2) // L1+L3
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1723 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1724 "movq %%mm2, (%%eax) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1725 "movq (%0, %1, 4), %%mm2 \n\t" // L4
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1726 PAVGB(%%mm2, %%mm1) // L2+L4
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1727 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1728 "movq %%mm1, (%%eax, %1) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1729 "movq (%%edx), %%mm1 \n\t" // L5
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1730 PAVGB(%%mm1, %%mm0) // L3+L5
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1731 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1732 "movq %%mm0, (%%eax, %1, 2) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1733 "movq (%%edx, %1), %%mm0 \n\t" // L6
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1734 PAVGB(%%mm0, %%mm2) // L4+L6
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1735 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1736 "movq %%mm2, (%0, %1, 4) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1737 "movq (%%edx, %1, 2), %%mm2 \n\t" // L7
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1738 PAVGB(%%mm2, %%mm1) // L5+L7
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1739 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1740 "movq %%mm1, (%%edx) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1741 "movq (%0, %1, 8), %%mm1 \n\t" // L8
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1742 PAVGB(%%mm1, %%mm0) // L6+L8
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1743 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1744 "movq %%mm0, (%%edx, %1) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1745 "movq (%%edx, %1, 4), %%mm0 \n\t" // L9
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1746 PAVGB(%%mm0, %%mm2) // L7+L9
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1747 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1748 "movq %%mm2, (%%edx, %1, 2) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1749
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1750
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1751 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1752 : "%eax", "%edx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1753 );
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1754 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1755 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1756 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1757 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1758 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1759 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1760 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1761 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1762 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1763 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1764 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1765 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1766 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1767 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1768 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1769 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1770 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1771
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1772 /**
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1773 * Deinterlaces the given block
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1774 * will be called for every 8x8 block and can read & write from line 4-15,
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1775 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1776 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1777 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1778 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1779 {
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1780 #ifdef HAVE_MMX
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1781 src+= 4*stride;
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1782 #ifdef HAVE_MMX2
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1783 asm volatile(
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1784 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1785 "leal (%%eax, %1, 4), %%edx \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1786 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1787 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1788
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1789 "movq (%0), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1790 "movq (%%eax, %1), %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1791 "movq (%%eax), %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1792 "movq %%mm0, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1793 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1794 "pminub %%mm3, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1795 "pmaxub %%mm2, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1796 "pminub %%mm1, %%mm0 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1797 "movq %%mm0, (%%eax) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1798
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1799 "movq (%0, %1, 4), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1800 "movq (%%eax, %1, 2), %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1801 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1802 "pmaxub %%mm1, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1803 "pminub %%mm3, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1804 "pmaxub %%mm0, %%mm1 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1805 "pminub %%mm1, %%mm2 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1806 "movq %%mm2, (%%eax, %1, 2) \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1807
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1808 "movq (%%edx), %%mm2 \n\t" //
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1809 "movq (%%edx, %1), %%mm1 \n\t" //
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1810 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1811 "pmaxub %%mm0, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1812 "pminub %%mm3, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1813 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1814 "pminub %%mm0, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1815 "movq %%mm2, (%%edx) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1816
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1817 "movq (%%edx, %1, 2), %%mm2 \n\t" //
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1818 "movq (%0, %1, 8), %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1819 "movq %%mm2, %%mm3 \n\t"
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1820 "pmaxub %%mm0, %%mm2 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1821 "pminub %%mm3, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1822 "pmaxub %%mm1, %%mm0 \n\t" //
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1823 "pminub %%mm0, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1824 "movq %%mm2, (%%edx, %1, 2) \n\t"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1825
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1826
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1827 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1828 : "%eax", "%edx"
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1829 );
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1830
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1831 #else // MMX without MMX2
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1832 asm volatile(
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1833 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1834 "leal (%%eax, %1, 4), %%edx \n\t"
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1835 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1836 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1837 "pxor %%mm7, %%mm7 \n\t"
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1838
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1839 #define MEDIAN(a,b,c)\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1840 "movq " #a ", %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1841 "movq " #b ", %%mm2 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1842 "movq " #c ", %%mm1 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1843 "movq %%mm0, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1844 "movq %%mm1, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1845 "movq %%mm2, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1846 "psubusb %%mm1, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1847 "psubusb %%mm2, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1848 "psubusb %%mm0, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1849 "pcmpeqb %%mm7, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1850 "pcmpeqb %%mm7, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1851 "pcmpeqb %%mm7, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1852 "movq %%mm3, %%mm6 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1853 "pxor %%mm4, %%mm3 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1854 "pxor %%mm5, %%mm4 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1855 "pxor %%mm6, %%mm5 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1856 "por %%mm3, %%mm1 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1857 "por %%mm4, %%mm2 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1858 "por %%mm5, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1859 "pand %%mm2, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1860 "pand %%mm1, %%mm0 \n\t"\
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1861 "movq %%mm0, " #b " \n\t"
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1862
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1863 MEDIAN((%0), (%%eax), (%%eax, %1))
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1864 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1865 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1866 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1867
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1868 : : "r" (src), "r" (stride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1869 : "%eax", "%edx"
2221
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1870 );
9fd911c931cd minor cleanups
michael
parents: 2203
diff changeset
1871 #endif // MMX
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1872 #else
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
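1873 //FIXME: this C fallback is a (1,2,1)/4 vertical blend, not the 3-row median that the MMX/MMX2 paths above compute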
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1874 int x;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
1875 src+= 4*stride;
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1876 for(x=0; x<8; x++)
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1877 {
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1878 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1879 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1880 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1881 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1882 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1883 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1884 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1885 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1886 src++;
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1887 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1888 #endif
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1889 }
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
1890
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
1891 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1892 /**
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1893 * transposes and shifts the given 8x8 block into dst1 and dst2
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1894 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1895 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1896 {
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1897 asm(
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1898 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1899 "leal (%%eax, %1, 4), %%edx \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1900 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1902 "movq (%0), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1903 "movq (%%eax), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1904 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1905 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1906 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1907
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1908 "movq (%%eax, %1), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1909 "movq (%%eax, %1, 2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1910 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1911 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1912 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1913
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1914 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1915 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1916 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1917 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1918 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1919 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1920
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1921 "movd %%mm0, 128(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1922 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1923 "movd %%mm0, 144(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1924 "movd %%mm3, 160(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1925 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1926 "movd %%mm3, 176(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1927 "movd %%mm3, 48(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1928 "movd %%mm2, 192(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1929 "movd %%mm2, 64(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1930 "psrlq $32, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1931 "movd %%mm2, 80(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1932 "movd %%mm1, 96(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1933 "psrlq $32, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1934 "movd %%mm1, 112(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1935
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1936 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1937 "movq (%%edx), %%mm1 \n\t" // abcdefgh
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1938 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1939 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1940 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1941
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1942 "movq (%%edx, %1), %%mm1 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1943 "movq (%%edx, %1, 2), %%mm3 \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1944 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1945 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1946 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1947
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1948 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1949 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1950 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1951 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1952 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1953 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1954
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1955 "movd %%mm0, 132(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1956 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1957 "movd %%mm0, 148(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1958 "movd %%mm3, 164(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1959 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1960 "movd %%mm3, 180(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1961 "movd %%mm3, 52(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1962 "movd %%mm2, 196(%2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1963 "movd %%mm2, 68(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1964 "psrlq $32, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1965 "movd %%mm2, 84(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1966 "movd %%mm1, 100(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1967 "psrlq $32, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1968 "movd %%mm1, 116(%3) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1969
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1970
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1971 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1972 : "%eax", "%edx"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1973 );
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1974 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1975
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1976 /**
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1977 * transposes the given 8x8 block
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1978 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
1979 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1980 {
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1981 asm(
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1982 "leal (%0, %1), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1983 "leal (%%eax, %1, 4), %%edx \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1984 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
1985 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1986 "movq (%2), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1987 "movq 16(%2), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1988 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1989 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1990 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1991
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1992 "movq 32(%2), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1993 "movq 48(%2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1994 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1995 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1996 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1997
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1998 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
1999 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2000 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2001 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2002 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2003 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2004
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2005 "movd %%mm0, (%0) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2006 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2007 "movd %%mm0, (%%eax) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2008 "movd %%mm3, (%%eax, %1) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2009 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2010 "movd %%mm3, (%%eax, %1, 2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2011 "movd %%mm2, (%0, %1, 4) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2012 "psrlq $32, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2013 "movd %%mm2, (%%edx) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2014 "movd %%mm1, (%%edx, %1) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2015 "psrlq $32, %%mm1 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2016 "movd %%mm1, (%%edx, %1, 2) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2017
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2018
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2019 "movq 64(%2), %%mm0 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2020 "movq 80(%2), %%mm1 \n\t" // abcdefgh
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2021 "movq %%mm0, %%mm2 \n\t" // 12345678
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2022 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2023 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2024
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2025 "movq 96(%2), %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2026 "movq 112(%2), %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2027 "movq %%mm1, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2028 "punpcklbw %%mm3, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2029 "punpckhbw %%mm3, %%mm4 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2030
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2031 "movq %%mm0, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2032 "punpcklwd %%mm1, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2033 "punpckhwd %%mm1, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2034 "movq %%mm2, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2035 "punpcklwd %%mm4, %%mm2 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2036 "punpckhwd %%mm4, %%mm1 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2037
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2038 "movd %%mm0, 4(%0) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2039 "psrlq $32, %%mm0 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2040 "movd %%mm0, 4(%%eax) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2041 "movd %%mm3, 4(%%eax, %1) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2042 "psrlq $32, %%mm3 \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2043 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2044 "movd %%mm2, 4(%0, %1, 4) \n\t"
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2045 "psrlq $32, %%mm2 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2046 "movd %%mm2, 4(%%edx) \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2047 "movd %%mm1, 4(%%edx, %1) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2048 "psrlq $32, %%mm1 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2049 "movd %%mm1, 4(%%edx, %1, 2) \n\t"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2050
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2051 :: "r" (dst), "r" (dstStride), "r" (src)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2052 : "%eax", "%edx"
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2053 );
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2054 }
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2055 #endif
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2056 //static int test=0;
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2057
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2058 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2059 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2060 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2061 // to save a register (FIXME do this outside of the loops)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2062 tempBluredPast[127]= maxNoise[0];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2063 tempBluredPast[128]= maxNoise[1];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2064 tempBluredPast[129]= maxNoise[2];
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2065
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2066 #define FAST_L2_DIFF
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2067 //#define L1_DIFF //you should change the thresholds too if you try that one
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2068 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2069 asm volatile(
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2070 "leal (%2, %2, 2), %%eax \n\t" // 3*stride
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2071 "leal (%2, %2, 4), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2072 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2073 // 0 1 2 3 4 5 6 7 8 9
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2074 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2075 //FIXME reorder?
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2076 #ifdef L1_DIFF //needs mmx2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2077 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2078 "psadbw (%1), %%mm0 \n\t" // |L0-R0|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2079 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2080 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2081 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2082 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2083 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2084 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2085
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2086 "movq (%0, %2, 4), %%mm4 \n\t" // L4
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2087 "paddw %%mm1, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2088 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2089 "movq (%0, %%edx), %%mm5 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2090 "paddw %%mm2, %%mm0 \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2091 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5|
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2092 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2093 "paddw %%mm3, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2094 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2095 "movq (%0, %%ecx), %%mm7 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2096 "paddw %%mm4, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2097 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2098 "paddw %%mm5, %%mm6 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2099 "paddw %%mm7, %%mm6 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2100 "paddw %%mm6, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2101 #elif defined (FAST_L2_DIFF)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2102 "pcmpeqb %%mm7, %%mm7 \n\t"
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
2103 "movq "MANGLE(b80)", %%mm6 \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2104 "pxor %%mm0, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2105 #define L2_DIFF_CORE(a, b)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2106 "movq " #a ", %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2107 "movq " #b ", %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2108 "pxor %%mm7, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2109 PAVGB(%%mm2, %%mm5)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2110 "paddb %%mm6, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2111 "movq %%mm5, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2112 "psllw $8, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2113 "pmaddwd %%mm5, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2114 "pmaddwd %%mm2, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2115 "paddd %%mm2, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2116 "psrld $14, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2117 "paddd %%mm5, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2118
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2119 L2_DIFF_CORE((%0), (%1))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2120 L2_DIFF_CORE((%0, %2), (%1, %2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2121 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2122 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2123 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2124 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2125 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2126 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2127
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2128 #else
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2129 "pxor %%mm7, %%mm7 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2130 "pxor %%mm0, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2131 #define L2_DIFF_CORE(a, b)\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2132 "movq " #a ", %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2133 "movq " #b ", %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2134 "movq %%mm5, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2135 "movq %%mm2, %%mm3 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2136 "punpcklbw %%mm7, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2137 "punpckhbw %%mm7, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2138 "punpcklbw %%mm7, %%mm2 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2139 "punpckhbw %%mm7, %%mm3 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2140 "psubw %%mm2, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2141 "psubw %%mm3, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2142 "pmaddwd %%mm5, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2143 "pmaddwd %%mm1, %%mm1 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2144 "paddd %%mm1, %%mm5 \n\t"\
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2145 "paddd %%mm5, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2146
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2147 L2_DIFF_CORE((%0), (%1))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2148 L2_DIFF_CORE((%0, %2), (%1, %2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2149 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2150 L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2151 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2152 L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2153 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2154 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2155
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2156 #endif
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2157
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2158 "movq %%mm0, %%mm4 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2159 "psrlq $32, %%mm0 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2160 "paddd %%mm0, %%mm4 \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2161 "movd %%mm4, %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2162 "shll $2, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2163 "movl %3, %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2164 "addl -4(%%edx), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2165 "addl 4(%%edx), %%ecx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2166 "addl -1024(%%edx), %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2167 "addl $4, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2168 "addl 1024(%%edx), %%ecx \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2169 "shrl $3, %%ecx \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2170 "movl %%ecx, (%%edx) \n\t"
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2171
4248
3cdb86beebce mangle for win32 in postproc
atmos4
parents: 3832
diff changeset
2172 // "movl %3, %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2173 // "movl %%ecx, test \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2174 // "jmp 4f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2175 "cmpl 512(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2176 " jb 2f \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2177 "cmpl 516(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2178 " jb 1f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2179
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2180 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2181 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2182 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2183 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2184 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2185 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2186 "movq (%0, %2, 4), %%mm4 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2187 "movq (%0, %%edx), %%mm5 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2188 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2189 "movq (%0, %%ecx), %%mm7 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2190 "movq %%mm0, (%1) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2191 "movq %%mm1, (%1, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2192 "movq %%mm2, (%1, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2193 "movq %%mm3, (%1, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2194 "movq %%mm4, (%1, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2195 "movq %%mm5, (%1, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2196 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2197 "movq %%mm7, (%1, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2198 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2199
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2200 "1: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2201 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2202 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2203 "movq (%0), %%mm0 \n\t" // L0
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2204 PAVGB((%1), %%mm0) // L0
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2205 "movq (%0, %2), %%mm1 \n\t" // L1
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2206 PAVGB((%1, %2), %%mm1) // L1
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2207 "movq (%0, %2, 2), %%mm2 \n\t" // L2
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2208 PAVGB((%1, %2, 2), %%mm2) // L2
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2209 "movq (%0, %%eax), %%mm3 \n\t" // L3
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2210 PAVGB((%1, %%eax), %%mm3) // L3
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2211 "movq (%0, %2, 4), %%mm4 \n\t" // L4
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2212 PAVGB((%1, %2, 4), %%mm4) // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2213 "movq (%0, %%edx), %%mm5 \n\t" // L5
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2214 PAVGB((%1, %%edx), %%mm5) // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2215 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2216 PAVGB((%1, %%eax, 2), %%mm6) // L6
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2217 "movq (%0, %%ecx), %%mm7 \n\t" // L7
5980
3b078401d610 3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents: 5787
diff changeset
2218 PAVGB((%1, %%ecx), %%mm7) // L7
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2219 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2220 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2221 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2222 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2223 "movq %%mm4, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2224 "movq %%mm5, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2225 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2226 "movq %%mm7, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2227 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2228 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2229 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2230 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2231 "movq %%mm4, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2232 "movq %%mm5, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2233 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2234 "movq %%mm7, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2235 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2236
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2237 "2: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2238 "cmpl 508(%%edx), %%ecx \n\t"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2239 " jb 3f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2240
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2241 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2242 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2243 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2244 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2245 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2246 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2247 "movq (%1), %%mm4 \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2248 "movq (%1, %2), %%mm5 \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2249 "movq (%1, %2, 2), %%mm6 \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2250 "movq (%1, %%eax), %%mm7 \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2251 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2252 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2253 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2254 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2255 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2256 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2257 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2258 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2259 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2260 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2261 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2262 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2263 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2264 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2265 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2266 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2267
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2268 "movq (%0, %2, 4), %%mm0 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2269 "movq (%0, %%edx), %%mm1 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2270 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2271 "movq (%0, %%ecx), %%mm3 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2272 "movq (%1, %2, 4), %%mm4 \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2273 "movq (%1, %%edx), %%mm5 \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2274 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2275 "movq (%1, %%ecx), %%mm7 \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2276 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2277 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2278 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2279 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2280 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2281 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2282 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2283 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2284 "movq %%mm0, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2285 "movq %%mm1, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2286 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2287 "movq %%mm3, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2288 "movq %%mm0, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2289 "movq %%mm1, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2290 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2291 "movq %%mm3, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2292 "jmp 4f \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2293
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2294 "3: \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2295 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2296 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2297 "movq (%0), %%mm0 \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2298 "movq (%0, %2), %%mm1 \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2299 "movq (%0, %2, 2), %%mm2 \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2300 "movq (%0, %%eax), %%mm3 \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2301 "movq (%1), %%mm4 \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2302 "movq (%1, %2), %%mm5 \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2303 "movq (%1, %2, 2), %%mm6 \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2304 "movq (%1, %%eax), %%mm7 \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2305 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2306 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2307 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2308 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2309 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2310 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2311 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2312 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2313 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2314 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2315 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2316 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2317 "movq %%mm0, (%1) \n\t" // R0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2318 "movq %%mm1, (%1, %2) \n\t" // R1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2319 "movq %%mm2, (%1, %2, 2) \n\t" // R2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2320 "movq %%mm3, (%1, %%eax) \n\t" // R3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2321 "movq %%mm0, (%0) \n\t" // L0
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2322 "movq %%mm1, (%0, %2) \n\t" // L1
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2323 "movq %%mm2, (%0, %2, 2) \n\t" // L2
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2324 "movq %%mm3, (%0, %%eax) \n\t" // L3
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2325
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2326 "movq (%0, %2, 4), %%mm0 \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2327 "movq (%0, %%edx), %%mm1 \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2328 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2329 "movq (%0, %%ecx), %%mm3 \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2330 "movq (%1, %2, 4), %%mm4 \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2331 "movq (%1, %%edx), %%mm5 \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2332 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2333 "movq (%1, %%ecx), %%mm7 \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2334 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2335 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2336 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2337 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2338 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2339 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2340 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2341 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2342 PAVGB(%%mm4, %%mm0)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2343 PAVGB(%%mm5, %%mm1)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2344 PAVGB(%%mm6, %%mm2)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2345 PAVGB(%%mm7, %%mm3)
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2346 "movq %%mm0, (%1, %2, 4) \n\t" // R4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2347 "movq %%mm1, (%1, %%edx) \n\t" // R5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2348 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2349 "movq %%mm3, (%1, %%ecx) \n\t" // R7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2350 "movq %%mm0, (%0, %2, 4) \n\t" // L4
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2351 "movq %%mm1, (%0, %%edx) \n\t" // L5
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2352 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2353 "movq %%mm3, (%0, %%ecx) \n\t" // L7
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2354
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2355 "4: \n\t"
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2356
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2357 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2358 : "%eax", "%edx", "%ecx", "memory"
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2359 );
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2360 //printf("%d\n", test);
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2361 #else
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2362 int y;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2363 int d=0;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2364 int sysd=0;
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2365 int i;
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2366
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2367 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2368 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2369 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2370 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2371 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2372 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2373 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2374 int d1=ref - cur;
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2375 // if(x==0 || x==7) d1+= d1>>1;
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2376 // if(y==0 || y==7) d1+= d1>>1;
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2377 // d+= ABS(d1);
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2378 d+= d1*d1;
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2379 sysd+= d1;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2380 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2381 }
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2382 i=d;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2383 d= (
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2384 4*d
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2385 +(*(tempBluredPast-256))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2386 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2387 +(*(tempBluredPast+256))
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2388 +4)>>3;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2389 *tempBluredPast=i;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2390 // ((*tempBluredPast)*3 + d + 2)>>2;
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2391
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2392 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2393 /*
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2394 Switch between
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2395 1 0 0 0 0 0 0 (0)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2396 64 32 16 8 4 2 1 (1)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2397 64 48 36 27 20 15 11 (33) (approx)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2398 64 56 49 43 37 33 29 (200) (approx)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2399 */
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2400 if(d > maxNoise[1])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2401 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2402 if(d < maxNoise[2])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2403 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2404 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2405 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2406 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2407 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2408 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2409 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2410 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2411 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2412 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2413 (ref + cur + 1)>>1;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2414 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2415 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2416 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2417 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2418 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2419 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2420 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2421 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2422 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2423 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2424 tempBlured[ x + y*stride ]= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2425 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2426 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2427 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2428 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2429 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2430 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2431 if(d < maxNoise[0])
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2432 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2433 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2434 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2435 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2436 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2437 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2438 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2439 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2440 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2441 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2442 (ref*7 + cur + 4)>>3;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2443 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2444 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2445 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2446 else
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2447 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2448 for(y=0; y<8; y++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2449 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2450 int x;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2451 for(x=0; x<8; x++)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2452 {
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2453 int ref= tempBlured[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2454 int cur= src[ x + y*stride ];
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2455 tempBlured[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2456 src[ x + y*stride ]=
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2457 (ref*3 + cur + 2)>>2;
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2458 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2459 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2460 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2461 }
2895
dd3fabd01df0 temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents: 2860
diff changeset
2462 #endif
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2463 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2464
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2465 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2466 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2467
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2468 /**
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2469 * Copies a block from src to dst and fixes the black level
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2470 * levelFix == 0 -> don't touch the brightness & contrast
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2471 */
7220
e3ecccc7e505 warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents: 6949
diff changeset
2472 #undef SCALED_CPY
e3ecccc7e505 warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents: 6949
diff changeset
2473
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2474 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2475 int levelFix, int64_t *packedOffsetAndScale)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2476 {
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2477 #ifndef HAVE_MMX
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2478 int i;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
2479 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2480 if(levelFix)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2481 {
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2482 #ifdef HAVE_MMX
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2483 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2484 "movq (%%eax), %%mm2 \n\t" // packedYOffset
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2485 "movq 8(%%eax), %%mm3 \n\t" // packedYScale
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2486 "leal (%2,%4), %%eax \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2487 "leal (%3,%5), %%edx \n\t"
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2488 "pxor %%mm4, %%mm4 \n\t"
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2489 #ifdef HAVE_MMX2
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2490 #define SCALED_CPY(src1, src2, dst1, dst2) \
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2491 "movq " #src1 ", %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2492 "movq " #src1 ", %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2493 "movq " #src2 ", %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2494 "movq " #src2 ", %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2495 "punpcklbw %%mm0, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2496 "punpckhbw %%mm5, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2497 "punpcklbw %%mm1, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2498 "punpckhbw %%mm6, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2499 "pmulhuw %%mm3, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2500 "pmulhuw %%mm3, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2501 "pmulhuw %%mm3, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2502 "pmulhuw %%mm3, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2503 "psubw %%mm2, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2504 "psubw %%mm2, %%mm5 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2505 "psubw %%mm2, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2506 "psubw %%mm2, %%mm6 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2507 "packuswb %%mm5, %%mm0 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2508 "packuswb %%mm6, %%mm1 \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2509 "movq %%mm0, " #dst1 " \n\t"\
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2510 "movq %%mm1, " #dst2 " \n\t"\
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2511
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2512 #else //HAVE_MMX2
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2513 #define SCALED_CPY(src1, src2, dst1, dst2) \
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2514 "movq " #src1 ", %%mm0 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2515 "movq " #src1 ", %%mm5 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2516 "punpcklbw %%mm4, %%mm0 \n\t"\
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2517 "punpckhbw %%mm4, %%mm5 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2518 "psubw %%mm2, %%mm0 \n\t"\
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2519 "psubw %%mm2, %%mm5 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2520 "movq " #src2 ", %%mm1 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2521 "psllw $6, %%mm0 \n\t"\
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2522 "psllw $6, %%mm5 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2523 "pmulhw %%mm3, %%mm0 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2524 "movq " #src2 ", %%mm6 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2525 "pmulhw %%mm3, %%mm5 \n\t"\
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2526 "punpcklbw %%mm4, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2527 "punpckhbw %%mm4, %%mm6 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2528 "psubw %%mm2, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2529 "psubw %%mm2, %%mm6 \n\t"\
2394
1cc35422b752 negative black bugfix
michael
parents: 2389
diff changeset
2530 "psllw $6, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2531 "psllw $6, %%mm6 \n\t"\
2181
d90f8fc7ead6 fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents: 2180
diff changeset
2532 "pmulhw %%mm3, %%mm1 \n\t"\
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2533 "pmulhw %%mm3, %%mm6 \n\t"\
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2534 "packuswb %%mm5, %%mm0 \n\t"\
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2535 "packuswb %%mm6, %%mm1 \n\t"\
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2536 "movq %%mm0, " #dst1 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2537 "movq %%mm1, " #dst2 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2538
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2539 #endif //!HAVE_MMX2
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2540
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2541 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2542 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2543 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2544 "leal (%%eax,%4,4), %%eax \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2545 "leal (%%edx,%5,4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2546 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2547
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2548
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2549 : "=&a" (packedOffsetAndScale)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2550 : "0" (packedOffsetAndScale),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2551 "r"(src),
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2552 "r"(dst),
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2553 "r" (srcStride),
2401
bc69d7c0e1dc brightness / contrast fix/copy optimizations +2% speedup
michael
parents: 2394
diff changeset
2554 "r" (dstStride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2555 : "%edx"
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2556 );
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2557 #else
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2558 for(i=0; i<8; i++)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2559 memcpy( &(dst[dstStride*i]),
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2560 &(src[srcStride*i]), BLOCK_SIZE);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2561 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2562 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2563 else
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2564 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2565 #ifdef HAVE_MMX
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2566 asm volatile(
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2567 "leal (%0,%2), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2568 "leal (%1,%3), %%edx \n\t"
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2569
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2570 #define SIMPLE_CPY(src1, src2, dst1, dst2) \
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2571 "movq " #src1 ", %%mm0 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2572 "movq " #src2 ", %%mm1 \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2573 "movq %%mm0, " #dst1 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2574 "movq %%mm1, " #dst2 " \n\t"\
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2575
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2576 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2577 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2578 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2579 "leal (%%eax,%2,4), %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2580 "leal (%%edx,%3,4), %%edx \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2581 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
3037
3fc9a8b9f178 1% speedup
michael
parents: 3032
diff changeset
2582
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2583 : : "r" (src),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2584 "r" (dst),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2585 "r" (srcStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2586 "r" (dstStride)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2587 : "%eax", "%edx"
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2588 );
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2589 #else
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2590 for(i=0; i<8; i++)
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2591 memcpy( &(dst[dstStride*i]),
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2592 &(src[srcStride*i]), BLOCK_SIZE);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2593 #endif
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2594 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2595 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2596
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2597 /**
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2598 * Duplicates the given 8 src pixels 3 times upward:
 * the 8 bytes at src are copied to src - stride, src - 2*stride and
 * src - 3*stride. The row at src itself is only read.
 * With HAVE_MMX this is done with one 64-bit load and three 64-bit stores,
 * otherwise with three memcpy() calls of 8 bytes each.
 * NOTE(review): the MMX asm stores to memory but declares no "memory"
 * clobber; confirm that callers do not rely on the compiler seeing these writes.
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2599 */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2600 static inline void RENAME(duplicate)(uint8_t src[], int stride)
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2601 {
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2602 #ifdef HAVE_MMX
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2603 asm volatile(
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2604 "movq (%0), %%mm0 \n\t" /* mm0 = the 8 bytes at src */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2605 "addl %1, %0 \n\t" /* src += -stride, i.e. step one row up */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2606 "movq %%mm0, (%0) \n\t" /* store at original src - stride */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2607 "movq %%mm0, (%0, %1) \n\t" /* store at original src - 2*stride */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2608 "movq %%mm0, (%0, %1, 2) \n\t" /* store at original src - 3*stride */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2609 : "+r" (src) /* %0: read-write operand, the addl above modifies it */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2610 : "r" (-stride) /* %1: negated stride, used as addend and as scaled index */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2611 );
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2612 #else
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2613 int i;
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2614 uint8_t *p=src;
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2615 for(i=0; i<3; i++)
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2616 {
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2617 p-= stride; /* move the destination one row up */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2618 memcpy(p, src, 8); /* overwrite that row's 8 pixels with the src row */
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2619 }
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2620 #endif
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2621 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2622
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2623 /**
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2624 * Filters array of bytes (Y or U or V values)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2625 */
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2626 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2627 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2628 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2629 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2630 int x,y;
3154
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2631 #ifdef COMPILE_TIME_MODE
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2632 const int mode= COMPILE_TIME_MODE;
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2633 #else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2634 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3154
b2e24fec97bc compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents: 3099
diff changeset
2635 #endif
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2636 int black=0, white=255; // blackest black and whitest white in the picture
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2637 int QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2638
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2639 int copyAhead;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2640
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2641 //FIXME remove
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2642 uint64_t * const yHistogram= c.yHistogram;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2643 uint8_t * const tempSrc= c.tempSrc;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2644 uint8_t * const tempDst= c.tempDst;
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
2645
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2646 c.dcOffset= c.ppMode.maxDcDiff;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2647 c.dcThreshold= c.ppMode.maxDcDiff*2 + 1;
3832
d05cfaf5f0f2 minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents: 3817
diff changeset
2648
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2649 #ifdef HAVE_MMX
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2650 c.mmxDcOffset= 0x7F - c.dcOffset;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2651 c.mmxDcThreshold= 0x7F - c.dcThreshold;
3832
d05cfaf5f0f2 minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents: 3817
diff changeset
2652
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2653 c.mmxDcOffset*= 0x0101010101010101LL;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2654 c.mmxDcThreshold*= 0x0101010101010101LL;
2899
6885bf566271 temp denoiser:
michael
parents: 2895
diff changeset
2655 #endif
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2656
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2657 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2658 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2659 || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2660 else if( (mode & V_DEBLOCK)
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2661 || (mode & LINEAR_IPOL_DEINT_FILTER)
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2662 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2663 else if(mode & V_X1_FILTER) copyAhead=11;
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2664 // else if(mode & V_RK1_FILTER) copyAhead=10;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2665 else if(mode & DERING) copyAhead=9;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2666 else copyAhead=8;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2667
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2668 copyAhead-= 8;
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2669
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2670 if(!isColor)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2671 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2672 uint64_t sum= 0;
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2673 int i;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2674 uint64_t maxClipped;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2675 uint64_t clipped;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2676 double scale;
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2677
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2678 c.frameNum++;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2679 // first frame is fscked so we ignore it
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2680 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2681
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2682 for(i=0; i<256; i++)
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2683 {
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2684 sum+= yHistogram[i];
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2685 // printf("%d ", yHistogram[i]);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2686 }
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2687 // printf("\n\n");
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2688
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2689 /* we always get a completely black picture first */
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2690 maxClipped= (uint64_t)(sum * maxClippedThreshold);
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2691
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2692 clipped= sum;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2693 for(black=255; black>0; black--)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2694 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2695 if(clipped < maxClipped) break;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2696 clipped-= yHistogram[black];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2697 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2698
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2699 clipped= sum;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2700 for(white=0; white<256; white++)
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2701 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2702 if(clipped < maxClipped) break;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2703 clipped-= yHistogram[white];
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2704 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2705
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2706 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2707
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2708 #ifdef HAVE_MMX2
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2709 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2710 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2711 #else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2712 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2713 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3171
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2714 #endif
3a325e7f6e1d faster brightness correcture in MMX2
michael
parents: 3154
diff changeset
2715
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2716 c.packedYOffset|= c.packedYOffset<<32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2717 c.packedYOffset|= c.packedYOffset<<16;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2718
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2719 c.packedYScale|= c.packedYScale<<32;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2720 c.packedYScale|= c.packedYScale<<16;
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2721
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2722 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2723 else QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2724 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2725 else
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2726 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2727 c.packedYScale= 0x0100010001000100LL;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2728 c.packedYOffset= 0;
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2729 QPCorrecture= 256*256;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2730 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2731
2742
d5636499cafd minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents: 2595
diff changeset
2732 /* copy & deinterlace first row of blocks */
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2733 y=-BLOCK_SIZE;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2734 {
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2735 uint8_t *srcBlock= &(src[y*srcStride]);
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2736 uint8_t *dstBlock= tempDst + dstStride;
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2737
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2738 // From this point on it is guaranteed that we can read and write 16 lines downward
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2739 // finish 1 block before the next otherwise we might have a problem
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2740 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2741 for(x=0; x<width; x+=BLOCK_SIZE)
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2742 {
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2743
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2744 #ifdef HAVE_MMX2
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2745 /*
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2746 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2747 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2748 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2749 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2750 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2751
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2752 asm(
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2753 "movl %4, %%eax \n\t"
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2754 "shrl $2, %%eax \n\t"
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2755 "andl $6, %%eax \n\t"
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2756 "addl %5, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2757 "movl %%eax, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2758 "imul %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2759 "imul %3, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2760 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2761 "prefetcht0 32(%%edx, %2) \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2762 "addl %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2763 "addl %3, %%edx \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2764 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2765 "prefetcht0 32(%%edx, %2) \n\t"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2766 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2767 "m" (x), "m" (copyAhead)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2768 : "%eax", "%edx"
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2769 );
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2770
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2771 #elif defined(HAVE_3DNOW)
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2772 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2773 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2774 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2775 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2776 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2777 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2778 #endif
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2779
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2780 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2781 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2782
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2783 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2784
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2785 if(mode & LINEAR_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2786 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2787 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2788 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2789 else if(mode & MEDIAN_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2790 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2791 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2792 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2793 else if(mode & FFMPEG_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2794 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2795 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2796 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2797 */
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2798 dstBlock+=8;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2799 srcBlock+=8;
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2800 }
4403
f3dc8bf8383a top row bugfix
michael
parents: 4399
diff changeset
2801 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
2802 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2803
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2804 //printf("\n");
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2805 for(y=0; y<height; y+=BLOCK_SIZE)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2806 {
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2807 //1% speedup if these are here instead of the inner loop
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2808 uint8_t *srcBlock= &(src[y*srcStride]);
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2809 uint8_t *dstBlock= &(dst[y*dstStride]);
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2810 #ifdef HAVE_MMX
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2811 uint8_t *tempBlock1= c.tempBlocks;
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2812 uint8_t *tempBlock2= c.tempBlocks + 8;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2813 #endif
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2814 #ifdef ARCH_X86
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2815 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2816 int QPDelta= isColor ? (-1) : 1<<31;
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2817 int QPFrac= 1<<30;
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2818 #endif
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2819 int QP=0;
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2820 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2821 if not then use a temporary buffer */
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2822 if(y+15 >= height)
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2823 {
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2824 int i;
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2825 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2826 blockcopy to dst later */
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2827 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2828 srcStride*MAX(height-y-copyAhead, 0) );
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2829
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2830 /* duplicate last line of src to fill the void up to line (copyAhead+7) */
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2831 for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2832 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2833
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2834 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2835 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2836
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2837 /* duplicate last line of dst to fill the void up to line (copyAhead) */
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2838 for(i=height-y+1; i<=copyAhead; i++)
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2839 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2840
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2841 dstBlock= tempDst + dstStride;
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2842 srcBlock= tempSrc;
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2843 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2844 //printf("\n");
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2845
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
2846 // From this point on it is guaranteed that we can read and write 16 lines downward
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2847 // finish 1 block before the next otherwise we might have a problem
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2848 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2849 for(x=0; x<width; x+=BLOCK_SIZE)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2850 {
2168
21a8f158d19f bugfixes: last 3 lines not brightness/contrast corrected
michael
parents: 2159
diff changeset
2851 const int stride= dstStride;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2852 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2853 uint8_t *tmpXchg;
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2854 #endif
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2855 #ifdef ARCH_X86
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2856 QP= *QPptr;
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2857 asm volatile(
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2858 "addl %2, %1 \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2859 "sbbl %%eax, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2860 "shll $2, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2861 "subl %%eax, %0 \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2862 : "+r" (QPptr), "+m" (QPFrac)
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2863 : "r" (QPDelta)
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2864 : "%eax"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2865 );
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2866 #else
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
2867 QP= isColor ?
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2868 QPs[(y>>3)*QPStride + (x>>3)]:
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2869 QPs[(y>>4)*QPStride + (x>>4)];
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2870 #endif
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2871 if(!isColor)
2428
85cda20c530f more speed
michael
parents: 2416
diff changeset
2872 {
4399
a4098aec828a minor QP bugfix
michael
parents: 4253
diff changeset
2873 QP= (QP* QPCorrecture + 256*128)>>16;
2742
d5636499cafd minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents: 2595
diff changeset
2874 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
2428
85cda20c530f more speed
michael
parents: 2416
diff changeset
2875 }
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2876 //printf("%d ", QP);
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2877 c.QP= QP;
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2878 #ifdef HAVE_MMX
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2879 asm volatile(
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2880 "movd %1, %%mm7 \n\t"
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2881 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2882 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2883 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2884 "movq %%mm7, %0 \n\t"
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2885 : "=m" (c.pQPb)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2886 : "r" (QP)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2887 );
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2888 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2889
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2890
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2891 #ifdef HAVE_MMX2
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2892 /*
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2893 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2894 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2895 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2896 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2897 */
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2898
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2899 asm(
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2900 "movl %4, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2901 "shrl $2, %%eax \n\t"
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2902 "andl $6, %%eax \n\t"
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2903 "addl %5, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2904 "movl %%eax, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2905 "imul %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2906 "imul %3, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2907 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2908 "prefetcht0 32(%%edx, %2) \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2909 "addl %1, %%eax \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2910 "addl %3, %%edx \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2911 "prefetchnta 32(%%eax, %0) \n\t"
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2912 "prefetcht0 32(%%edx, %2) \n\t"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2913 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
2914 "m" (x), "m" (copyAhead)
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2915 : "%eax", "%edx"
2437
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2916 );
de434f02dee6 more speed
michael
parents: 2428
diff changeset
2917
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2918 #elif defined(HAVE_3DNOW)
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2919 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2920 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2921 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2922 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2923 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2924 */
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2925 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2926
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2927 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2928 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2929
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2930 if(mode & LINEAR_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2931 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2932 else if(mode & LINEAR_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2933 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2934 else if(mode & MEDIAN_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2935 RENAME(deInterlaceMedian)(dstBlock, dstStride);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2936 else if(mode & CUBIC_IPOL_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2937 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2938 else if(mode & FFMPEG_DEINT_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2939 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2940 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2941 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2203
f90b6e259dc8 rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents: 2195
diff changeset
2942 */
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2943
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2944 /* only deblock if we have 2 blocks */
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2945 if(y + 8 < height)
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2946 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2947 if(mode & V_X1_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2948 RENAME(vertX1Filter)(dstBlock, stride, &c);
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2949 else if(mode & V_DEBLOCK)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2950 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2951 if( RENAME(isVertDC)(dstBlock, stride, &c))
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2952 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2953 if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2954 RENAME(doVertLowPass)(dstBlock, stride, &c);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2955 }
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2956 else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2957 RENAME(doVertDefFilter)(dstBlock, stride, &c);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2958 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2959 }
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2960
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2961 #ifdef HAVE_MMX
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2962 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2963 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
2964 /* check if we have a previous block to deblock it with dstBlock */
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
2965 if(x - 8 >= 0)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2966 {
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2967 #ifdef HAVE_MMX
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2968 if(mode & H_X1_FILTER)
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2969 RENAME(vertX1Filter)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2970 else if(mode & H_DEBLOCK)
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2971 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2972 if( RENAME(isVertDC)(tempBlock1, 16, &c))
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2973 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2974 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2975 RENAME(doVertLowPass)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2976 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2977 else
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2978 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2979 }
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2980
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
2981 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2982
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2983 #else
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2984 if(mode & H_X1_FILTER)
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2985 horizX1Filter(dstBlock-4, stride, QP);
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2986 else if(mode & H_DEBLOCK)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2987 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
2988 if( isHorizDC(dstBlock-4, stride, &c))
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2989 {
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2990 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2991 doHorizLowPass(dstBlock-4, stride, QP);
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
2992 }
2300
e10f7dc4938f more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents: 2286
diff changeset
2993 else
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2994 doHorizDefFilter(dstBlock-4, stride, QP);
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
2995 }
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
2996 #endif
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2997 if(mode & DERING)
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2998 {
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
2999 //FIXME filter first line
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3000 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3001 }
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3002
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3003 if(mode & TEMP_NOISE_FILTER)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3004 {
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
3005 RENAME(tempNoiseReducer)(dstBlock-8, stride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3006 c.tempBlured[isColor] + y*dstStride + x,
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3007 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3008 c.ppMode.maxTmpNoise);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3009 }
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3010 }
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3011
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3012 dstBlock+=8;
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3013 srcBlock+=8;
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3014
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
3015 #ifdef HAVE_MMX
2454
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3016 tmpXchg= tempBlock1;
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3017 tempBlock1= tempBlock2;
b74c2a08eac9 much better horizontal filters (transpose & use the vertical ones) :)
michael
parents: 2437
diff changeset
3018 tempBlock2 = tmpXchg;
2461
60f16575bece fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents: 2454
diff changeset
3019 #endif
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3020 }
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3021
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3022 if(mode & DERING)
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3023 {
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3024 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3025 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3026
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3027 if((mode & TEMP_NOISE_FILTER))
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3028 {
3099
897d46457708 runtime cpu detection
michael
parents: 3094
diff changeset
3029 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3030 c.tempBlured[isColor] + y*dstStride + x,
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3031 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3032 c.ppMode.maxTmpNoise);
2860
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3033 }
08b833fb875a temporal noise reducer in C (-pp 0x100000)
michael
parents: 2808
diff changeset
3034
2595
6c1d6f508466 deinterlace bugfix
michael
parents: 2586
diff changeset
3035 /* did we use a tmp buffer for the last lines*/
2285
4840e356d0d3 fixed a bug in the tmp buffer
michael
parents: 2246
diff changeset
3036 if(y+15 >= height)
2246
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3037 {
f7c1485b33be fixed the height%8!=0 bug
michael
parents: 2231
diff changeset
3038 uint8_t *dstBlock= &(dst[y*dstStride]);
2473
94a0265c408c dering in mmx2
michael
parents: 2461
diff changeset
3039 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3040 }
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3041 /*
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3042 for(x=0; x<width; x+=32)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3043 {
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3044 volatile int i;
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3045 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3046 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3031
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3047 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3048 // + dstBlock[x +13*dstStride]
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3049 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3050 }*/
86e1a0f4f0bc cleanup
michael
parents: 3013
diff changeset
3051 }
2159
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3052 #ifdef HAVE_3DNOW
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3053 asm volatile("femms");
795f3d022657 fixed a bug in the horizontal default filter
arpi
parents: 2158
diff changeset
3054 #elif defined (HAVE_MMX)
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3055 asm volatile("emms");
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3056 #endif
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3057
3013
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3058 #ifdef DEBUG_BRIGHTNESS
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3059 if(!isColor)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3060 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3061 int max=1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3062 int i;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3063 for(i=0; i<256; i++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3064 if(yHistogram[i] > max) max=yHistogram[i];
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3065
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3066 for(i=1; i<256; i++)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3067 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3068 int x;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3069 int start=yHistogram[i-1]/(max/256+1);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3070 int end=yHistogram[i]/(max/256+1);
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3071 int inc= end > start ? 1 : -1;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3072 for(x=start; x!=end+inc; x+=inc)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3073 dst[ i*dstStride + x]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3074 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3075
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3076 for(i=0; i<100; i+=2)
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3077 {
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3078 dst[ (white)*dstStride + i]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3079 dst[ (black)*dstStride + i]+=128;
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3080 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3081
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3082 }
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3083 #endif
71384f064a3e faster mmx2 / 3dnow deblocking filter
michael
parents: 2899
diff changeset
3084
7946
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3085 *c2= c; //copy local context back
f483ab704252 postprocessing cleanup:
michael
parents: 7220
diff changeset
3086
2158
508468a75be0 new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff changeset
3087 }