Mercurial > mplayer.hg
annotate postproc/postprocess_template.c @ 7946:f483ab704252
postprocessing cleanup:
remove opendivx #ifdefs
remove rk1 filter
remove unused / obsolete stuff
add -1,4,2,4,-1 deinterlacing filter (ffmpeg uses that)
threadsafe / no more non-const globals
some optimizations
different strides for Y,U,V possible
remove ebx usage (someone really should fix gcc, this is really lame)
change the dering filter slightly (tell me if it's worse for any files)
author | michael |
---|---|
date | Mon, 28 Oct 2002 19:31:04 +0000 |
parents | e3ecccc7e505 |
children | 5a6cbe774760 |
rev | line source |
---|---|
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
4399 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
3099 | 19 #undef PAVGB |
20 #undef PMINUB | |
21 #undef PMAXUB | |
2189 | 22 |
23 #ifdef HAVE_MMX2 | |
24 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
25 #elif defined (HAVE_3DNOW) | |
26 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
27 #endif | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
28 |
2477 | 29 #ifdef HAVE_MMX2 |
30 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
31 #elif defined (HAVE_MMX) | |
32 #define PMINUB(b,a,t) \ | |
33 "movq " #a ", " #t " \n\t"\ | |
34 "psubusb " #b ", " #t " \n\t"\ | |
35 "psubb " #t ", " #a " \n\t" | |
36 #endif | |
37 | |
38 #ifdef HAVE_MMX2 | |
39 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
40 #elif defined (HAVE_MMX) | |
41 #define PMAXUB(a,b) \ | |
42 "psubusb " #a ", " #b " \n\t"\ | |
43 "paddb " #a ", " #b " \n\t" | |
44 #endif | |
45 | |
46 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
47 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
7946 | 48 #ifdef HAVE_MMX |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
49 /** |
2246 | 50 * Check if the middle 8x8 Block in the given 8x16 block is flat |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
51 */ |
7946 | 52 static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){ |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
53 int numEq= 0; |
2246 | 54 src+= stride*4; // src points to beginning of the 8x8 Block |
2413 | 55 asm volatile( |
56 "leal (%1, %2), %%eax \n\t" | |
57 // 0 1 2 3 4 5 6 7 8 9 | |
7946 | 58 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
59 "movq %3, %%mm7 \n\t" // mm7 = 0x7F | |
60 "movq %4, %%mm6 \n\t" // mm6 = 0x7D | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
61 "movq (%1), %%mm0 \n\t" |
2413 | 62 "movq (%%eax), %%mm1 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
63 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
64 "paddb %%mm7, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
65 "pcmpgtb %%mm6, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
66 |
2413 | 67 "movq (%%eax,%2), %%mm2 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
68 "psubb %%mm2, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
69 "paddb %%mm7, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
70 "pcmpgtb %%mm6, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
71 "paddb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
72 |
2413 | 73 "movq (%%eax, %2, 2), %%mm1 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
74 "psubb %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
75 "paddb %%mm7, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
76 "pcmpgtb %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
77 "paddb %%mm2, %%mm0 \n\t" |
7946 | 78 |
79 "leal (%%eax, %2, 4), %%eax \n\t" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
80 |
2413 | 81 "movq (%1, %2, 4), %%mm2 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
82 "psubb %%mm2, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
83 "paddb %%mm7, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
84 "pcmpgtb %%mm6, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
85 "paddb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
86 |
7946 | 87 "movq (%%eax), %%mm1 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
88 "psubb %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
89 "paddb %%mm7, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
90 "pcmpgtb %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
91 "paddb %%mm2, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
92 |
7946 | 93 "movq (%%eax, %2), %%mm2 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
94 "psubb %%mm2, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
95 "paddb %%mm7, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
96 "pcmpgtb %%mm6, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
97 "paddb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
98 |
7946 | 99 "movq (%%eax, %2, 2), %%mm1 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
100 "psubb %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
101 "paddb %%mm7, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 "pcmpgtb %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
103 "paddb %%mm2, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
104 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
105 " \n\t" |
3093 | 106 #ifdef HAVE_MMX2 |
107 "pxor %%mm7, %%mm7 \n\t" | |
108 "psadbw %%mm7, %%mm0 \n\t" | |
109 #else | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
110 "movq %%mm0, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
111 "psrlw $8, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
112 "paddb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
113 "movq %%mm0, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 "psrlq $16, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
115 "paddb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
116 "movq %%mm0, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
117 "psrlq $32, %%mm0 \n\t" |
3093 | 118 "paddb %%mm1, %%mm0 \n\t" |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
119 #endif |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
120 "movd %%mm0, %0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
121 : "=r" (numEq) |
7946 | 122 : "r" (src), "r" (stride), "m" (c->mmxDcOffset), "m" (c->mmxDcThreshold) |
123 : "%eax" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
124 ); |
3093 | 125 numEq= (-numEq) &0xFF; |
7946 | 126 return numEq > c->ppMode.flatnessThreshold; |
127 } | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
129 |
7946 | 130 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
131 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
132 #ifdef HAVE_MMX |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
133 int isOk; |
2246 | 134 src+= stride*3; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
135 asm volatile( |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
136 "movq (%1, %2), %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
137 "movq (%1, %2, 8), %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
138 "movq %%mm0, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
139 "psubusb %%mm1, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 "psubusb %%mm2, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 "por %%mm1, %%mm0 \n\t" // ABS Diff |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 |
7946 | 143 "movq %3, %%mm7 \n\t" // QP,..., QP |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
145 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 |
7946 | 146 "packssdw %%mm0, %%mm0 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
147 "movd %%mm0, %0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
148 : "=r" (isOk) |
7946 | 149 : "r" (src), "r" (stride), "m" (c->pQPb) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
150 ); |
7946 | 151 return isOk==0; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
152 #else |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
153 int x; |
7946 | 154 const int QP= c->QP; |
2246 | 155 src+= stride*3; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
156 for(x=0; x<BLOCK_SIZE; x++) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
157 { |
7946 | 158 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
159 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
160 |
7946 | 161 return 1; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
162 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
163 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
164 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
165 /** |
2246 | 166 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
2221 | 167 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
168 */ |
7946 | 169 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
170 { |
2159 | 171 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2246 | 172 src+= stride*3; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
173 asm volatile( //"movv %0 %1 %2\n\t" |
7946 | 174 "movq %2, %%mm0 \n\t" // QP,..., QP |
175 "pxor %%mm4, %%mm4 \n\t" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
176 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
177 "movq (%0), %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
178 "movq (%0, %1), %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
179 "movq %%mm5, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
180 "movq %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
181 "psubusb %%mm6, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
182 "psubusb %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
183 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
184 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
7946 | 185 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
186 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
187 "pand %%mm2, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
188 "pandn %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
189 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
190 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 "movq (%0, %1, 8), %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
192 "leal (%0, %1, 4), %%eax \n\t" |
7946 | 193 "leal (%0, %1, 8), %%ecx \n\t" |
194 "subl %1, %%ecx \n\t" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
195 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
196 "movq (%0, %1, 8), %%mm7 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
197 "movq %%mm5, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
198 "movq %%mm7, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
199 "psubusb %%mm7, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 "psubusb %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
201 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
202 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
7946 | 203 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 "pand %%mm2, %%mm7 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 "pandn %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
207 "por %%mm2, %%mm7 \n\t" // Last Line to Filter |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
208 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 // 1 2 3 4 5 6 7 8 |
7946 | 211 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
212 // 6 4 2 2 1 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 // 6 4 4 2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
214 // 6 8 2 |
2246 | 215 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
216 "movq (%0, %1), %%mm0 \n\t" // 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
217 "movq %%mm0, %%mm1 \n\t" // 1 |
2159 | 218 PAVGB(%%mm6, %%mm0) //1 1 /2 |
219 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
220 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
221 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
222 "movq %%mm2, %%mm5 \n\t" // 1 |
2159 | 223 PAVGB((%%eax), %%mm2) // 11 /2 |
224 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "movq (%0), %%mm4 \n\t" // 1 |
2159 | 227 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
228 PAVGB(%%mm0, %%mm3) //642211 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
229 "movq %%mm3, (%0) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
230 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
231 "movq %%mm1, %%mm0 \n\t" // 1 |
2159 | 232 PAVGB(%%mm6, %%mm0) //1 1 /2 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
233 "movq %%mm4, %%mm3 \n\t" // 1 |
2159 | 234 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
235 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
236 PAVGB((%%eax), %%mm5) // 211 /4 | |
237 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
238 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
239 "movq %%mm3, (%0,%1) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
240 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
2159 | 241 PAVGB(%%mm4, %%mm6) //11 /2 |
7946 | 242 "movq (%%ecx), %%mm0 \n\t" // 1 |
2159 | 243 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 "movq %%mm0, %%mm3 \n\t" // 11/2 |
2159 | 245 PAVGB(%%mm1, %%mm0) // 2 11/4 |
246 PAVGB(%%mm6, %%mm0) //222 11/8 | |
247 PAVGB(%%mm2, %%mm0) //22242211/16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
248 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
249 "movq %%mm0, (%0, %1, 2) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
250 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
251 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
7946 | 252 PAVGB((%%ecx), %%mm0) // 11 /2 |
2159 | 253 PAVGB(%%mm0, %%mm6) //11 11 /4 |
254 PAVGB(%%mm1, %%mm4) // 11 /2 | |
255 PAVGB(%%mm2, %%mm1) // 11 /2 | |
256 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
257 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
258 "movq (%%eax), %%mm5 \n\t" // 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
259 "movq %%mm6, (%%eax) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
260 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
261 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
2159 | 262 PAVGB(%%mm7, %%mm6) // 11 /2 |
263 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
264 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
265 PAVGB(%%mm5, %%mm2) // 11 /2 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
266 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
2159 | 267 PAVGB(%%mm4, %%mm2) // 112 /4 |
268 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
269 "movq %%mm6, (%0, %1, 4) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
270 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
2159 | 271 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
272 PAVGB(%%mm4, %%mm5) // 11 /2 | |
273 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
274 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
2159 | 275 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
276 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
277 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
278 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
7946 | 279 PAVGB((%%ecx), %%mm2) // 112 4 /8 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
280 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
2159 | 281 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
282 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
283 PAVGB(%%mm2, %%mm6) // 1122424 /16 | |
7946 | 284 "movq %%mm6, (%%ecx) \n\t" // X |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
285 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
2159 | 286 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
287 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
288 |
2159 | 289 PAVGB(%%mm3, %%mm0) // 112 /4 |
290 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
291 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
2570 | 292 "subl %1, %0 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
293 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
294 : |
7946 | 295 : "r" (src), "r" (stride), "m" (c->pQPb) |
296 : "%eax", "%ecx" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
297 ); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
298 #else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
299 const int l1= stride; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
300 const int l2= stride + l1; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
301 const int l3= stride + l2; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
302 const int l4= stride + l3; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
303 const int l5= stride + l4; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
304 const int l6= stride + l5; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 const int l7= stride + l6; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
306 const int l8= stride + l7; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
307 const int l9= stride + l8; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
308 int x; |
2246 | 309 src+= stride*3; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
310 for(x=0; x<BLOCK_SIZE; x++) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
311 { |
7946 | 312 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
313 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
314 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
315 int sums[9]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
316 sums[0] = first + src[l1]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
317 sums[1] = src[l1] + src[l2]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
318 sums[2] = src[l2] + src[l3]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
319 sums[3] = src[l3] + src[l4]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
320 sums[4] = src[l4] + src[l5]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
321 sums[5] = src[l5] + src[l6]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
322 sums[6] = src[l6] + src[l7]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
323 sums[7] = src[l7] + src[l8]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
324 sums[8] = src[l8] + last; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
325 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
326 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
327 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
328 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
329 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
330 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
331 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
332 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
333 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
334 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
335 src++; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
336 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
337 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
338 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
339 |
7946 | 340 #if 0 |
2159 | 341 /** |
342 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
343 * values are correctly clipped (MMX2) | |
344 * values wrap around (C) | |
345 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient | |
346 0 8 16 24 | |
347 x = 8 | |
348 x/2 = 4 | |
349 x/8 = 1 | |
350 1 12 12 23 | |
 * C path, per column, with lines counted from src + 3*stride and v = l5 - l4:
 * if |v| < QP + QP/4 then l3 += v>>3, l4 += v>>1, l5 -= v>>1, l6 -= v>>3.
 * The MMX2/3DNow path takes its threshold from MANGLE(pQPb), not from the QP argument.
 * The whole function is compiled out by the surrounding #if 0.
351 */ | |
3099 | 352 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
2159 | 353 { |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
354 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2246 | 355 src+= stride*3; |
2159 | 356 // FIXME rounding |
357 asm volatile( | |
358 "pxor %%mm7, %%mm7 \n\t" // 0 | |
4248 | 359 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
2159 | 360 "leal (%0, %1), %%eax \n\t" |
7946 | 361 "leal (%%eax, %1, 4), %%ecx \n\t" |
2159 | 362 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 363 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
4248 | 364 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
2159 | 365 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
4248 | 366 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
2159 | 367 "psrlw $2, %%mm0 \n\t" |
4248 | 368 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
2159 | 369 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
370 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
7946 | 371 "movq (%%ecx), %%mm3 \n\t" // line 5 |
2159 | 372 "movq %%mm2, %%mm4 \n\t" // line 4 |
373 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
374 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
375 PAVGB(%%mm3, %%mm5) |
2159 | 376 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
377 "psubusb %%mm3, %%mm4 \n\t" | |
378 "psubusb %%mm2, %%mm3 \n\t" | |
379 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
380 "psubusb %%mm0, %%mm4 \n\t" | |
381 "pcmpeqb %%mm7, %%mm4 \n\t" | |
382 "pand %%mm4, %%mm5 \n\t" // d/2 | |
383 | |
384 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
385 "paddb %%mm5, %%mm2 \n\t" | |
386 // "psubb %%mm6, %%mm2 \n\t" | |
387 "movq %%mm2, (%0,%1, 4) \n\t" | |
388 | |
7946 | 389 "movq (%%ecx), %%mm2 \n\t" |
2159 | 390 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
391 "psubb %%mm5, %%mm2 \n\t" | |
392 // "psubb %%mm6, %%mm2 \n\t" | |
7946 | 393 "movq %%mm2, (%%ecx) \n\t" |
2159 | 394 |
395 "paddb %%mm6, %%mm5 \n\t" | |
396 "psrlw $2, %%mm5 \n\t" | |
4248 | 397 "pand "MANGLE(b3F)", %%mm5 \n\t" |
398 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
2159 | 399 |
400 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
401 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
402 "paddsb %%mm5, %%mm2 \n\t" | |
403 "psubb %%mm6, %%mm2 \n\t" | |
404 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
405 | |
7946 | 406 "movq (%%ecx, %1), %%mm2 \n\t" |
2159 | 407 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
408 "psubsb %%mm5, %%mm2 \n\t" | |
409 "psubb %%mm6, %%mm2 \n\t" | |
7946 | 410 "movq %%mm2, (%%ecx, %1) \n\t" |
2159 | 411 |
412 : | |
413 : "r" (src), "r" (stride) | |
7946 | 414 : "%eax", "%ecx" |
2159 | 415 ); |
416 #else | |
417 const int l1= stride; | |
418 const int l2= stride + l1; | |
419 const int l3= stride + l2; | |
420 const int l4= stride + l3; | |
421 const int l5= stride + l4; | |
422 const int l6= stride + l5; | |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
423 // const int l7= stride + l6; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
424 // const int l8= stride + l7; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
425 // const int l9= stride + l8; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
426 int x; |
// QP15 holds QP + QP/4, i.e. QP*1.25 (the same threshold the asm path builds), despite its name
2586 | 427 const int QP15= QP + (QP>>2); |
2246 | 428 src+= stride*3; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
// the column is selected by the x offset in each index; src itself is not advanced in this loop
429 for(x=0; x<BLOCK_SIZE; x++) |
2159 | 430 { |
2586 | 431 const int v = (src[x+l5] - src[x+l4]); |
432 if(ABS(v) < QP15) | |
2159 | 433 { |
2586 | 434 src[x+l3] +=v>>3; |
435 src[x+l4] +=v>>1; | |
436 src[x+l5] -=v>>1; | |
437 src[x+l6] -=v>>3; | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
438 |
2159 | 439 } |
440 } | |
441 | |
442 #endif | |
443 } | |
7946 | 444 #endif |
2159 | 445 |
446 /** | |
447 * Experimental Filter 1 | |
2179 | 448 * will not damage linear gradients |
449 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
450 * can only smooth blocks at the expected locations (it can't smooth them if they moved) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
451 * the MMX2 version clips correctly, the C version doesn't |
 * Lines are counted from src + 3*stride; lines 2..7 of each column may be modified.
 * C path: d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2); if d < 2*QP the step between
 * lines 4 and 5 is reduced by adding/subtracting v>>3, v>>2 and (3*v)>>3 around it.
 * The MMX2/3DNow path reads the threshold from co->pQPb, the C path from co->QP.
2159 | 452 */ |
7946 | 453 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
2159 | 454 { |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
455 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2246 | 456 src+= stride*3; |
457 | |
2159 | 458 asm volatile( |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
459 "pxor %%mm7, %%mm7 \n\t" // 0 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
460 "leal (%0, %1), %%eax \n\t" |
7946 | 461 "leal (%%eax, %1, 4), %%ecx \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
462 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 463 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
464 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
465 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
466 "movq %%mm1, %%mm2 \n\t" // line 4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
467 "psubusb %%mm0, %%mm1 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
468 "psubusb %%mm2, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
469 "por %%mm1, %%mm0 \n\t" // |l3 - l4| |
7946 | 470 "movq (%%ecx), %%mm3 \n\t" // line 5 |
471 "movq (%%ecx, %1), %%mm4 \n\t" // line 6 | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
472 "movq %%mm3, %%mm5 \n\t" // line 5 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
473 "psubusb %%mm4, %%mm3 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
474 "psubusb %%mm5, %%mm4 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
475 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
476 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
477 "movq %%mm2, %%mm1 \n\t" // line 4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
478 "psubusb %%mm5, %%mm2 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
479 "movq %%mm2, %%mm4 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
480 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
481 "psubusb %%mm1, %%mm5 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
482 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
483 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
484 "movq %%mm4, %%mm3 \n\t" // d |
7946 | 485 "movq %2, %%mm0 \n\t" |
5787 | 486 "paddusb %%mm0, %%mm0 \n\t" |
487 "psubusb %%mm0, %%mm4 \n\t" | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
488 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2*QP ? -1 : 0 |
4248 | 489 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
490 "pand %%mm4, %%mm3 \n\t" // d <= 2*QP ? d : 0 (d after the b01 subtraction) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
491 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
492 PAVGB(%%mm7, %%mm3) // d/2 |
2179 | 493 "movq %%mm3, %%mm1 \n\t" // d/2 |
494 PAVGB(%%mm7, %%mm3) // d/4 | |
495 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
496 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
497 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
498 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
499 "psubusb %%mm3, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
500 "pxor %%mm2, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
501 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
502 |
7946 | 503 "movq (%%ecx), %%mm0 \n\t" // line 5 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
504 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
505 "paddusb %%mm3, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
506 "pxor %%mm2, %%mm0 \n\t" |
7946 | 507 "movq %%mm0, (%%ecx) \n\t" // line 5 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
508 |
2179 | 509 PAVGB(%%mm7, %%mm1) // d/4 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
510 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
512 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3 |
2179 | 513 "psubusb %%mm1, %%mm0 \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
514 "pxor %%mm2, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
515 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
516 |
7946 | 517 "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
518 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6 |
2179 | 519 "paddusb %%mm1, %%mm0 \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
520 "pxor %%mm2, %%mm0 \n\t" |
7946 | 521 "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
522 |
2179 | 523 PAVGB(%%mm7, %%mm1) // d/8 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
524 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
525 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
526 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
2179 | 527 "psubusb %%mm1, %%mm0 \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
528 "pxor %%mm2, %%mm0 \n\t" |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
529 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
530 |
7946 | 531 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
532 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
2179 | 533 "paddusb %%mm1, %%mm0 \n\t" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
534 "pxor %%mm2, %%mm0 \n\t" |
7946 | 535 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
2159 | 536 |
537 : | |
7946 | 538 : "r" (src), "r" (stride), "m" (co->pQPb) |
539 : "%eax", "%ecx" | |
2159 | 540 ); |
541 #else | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
542 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
543 const int l1= stride; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
544 const int l2= stride + l1; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
545 const int l3= stride + l2; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
546 const int l4= stride + l3; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
547 const int l5= stride + l4; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
548 const int l6= stride + l5; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
549 const int l7= stride + l6; |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
550 // const int l8= stride + l7; |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
551 // const int l9= stride + l8; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
552 int x; |
2246 | 553 |
554 src+= stride*3; | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
555 for(x=0; x<BLOCK_SIZE; x++) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
556 { |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
// a, b, c: differences between neighbouring lines 3/4, 4/5 and 5/6 of the current column
557 int a= src[l3] - src[l4]; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
558 int b= src[l4] - src[l5]; |
2179 | 559 int c= src[l5] - src[l6]; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
560 |
// d: amount by which the 4/5 step exceeds the mean of the two neighbouring steps, floored at 0
2586 | 561 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
562 d= MAX(d, 0); | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
563 |
7946 | 564 if(d < co->QP*2) |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
565 { |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
// v has the opposite sign of b (assuming SIGN() yields +/-1), so the updates below shrink the 4/5 step
566 int v = d * SIGN(-b); |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
567 |
2586 | 568 src[l2] +=v>>3; |
569 src[l3] +=v>>2; | |
570 src[l4] +=(3*v)>>3; | |
571 src[l5] -=(3*v)>>3; | |
572 src[l6] -=v>>2; | |
573 src[l7] -=v>>3; | |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
574 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
575 } |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
576 src++; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
577 } |
2159 | 578 #endif |
579 } | |
580 | |
7946 | 581 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
582 { |
3013 | 583 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
584 /* | |
585 uint8_t tmp[16]; | |
586 const int l1= stride; | |
587 const int l2= stride + l1; | |
588 const int l3= stride + l2; | |
589 const int l4= (int)tmp - (int)src - stride*3; | |
590 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
591 const int l6= stride*3 + l3; | |
592 const int l7= stride + l6; | |
593 const int l8= stride + l7; | |
594 | |
595 memcpy(tmp, src+stride*7, 8); | |
596 memcpy(tmp+8, src+stride*8, 8); | |
597 */ | |
2246 | 598 src+= stride*4; |
3013 | 599 asm volatile( |
600 | |
601 #if 0 //sligtly more accurate and slightly slower | |
602 "pxor %%mm7, %%mm7 \n\t" // 0 | |
603 "leal (%0, %1), %%eax \n\t" | |
7946 | 604 "leal (%%eax, %1, 4), %%ecx \n\t" |
3013 | 605 // 0 1 2 3 4 5 6 7 |
7946 | 606 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
607 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
3013 | 608 |
609 | |
610 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
611 "movq (%0), %%mm1 \n\t" // l0 | |
612 "movq %%mm0, %%mm2 \n\t" // l2 | |
613 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
614 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
615 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
616 | |
617 "movq (%%eax), %%mm1 \n\t" // l1 | |
618 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |
619 "movq %%mm1, %%mm4 \n\t" // l1 | |
620 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
621 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
622 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
623 | |
624 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
625 "psubusb %%mm1, %%mm0 \n\t" | |
626 "psubusb %%mm4, %%mm1 \n\t" | |
627 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
628 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
629 | |
630 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
631 "movq %%mm0, %%mm4 \n\t" // l4 | |
632 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
633 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
634 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
635 | |
7946 | 636 "movq (%%ecx), %%mm2 \n\t" // l5 |
3013 | 637 "movq %%mm3, %%mm5 \n\t" // l3 |
638 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
639 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
640 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
641 | |
642 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
643 "psubusb %%mm3, %%mm0 \n\t" | |
644 "psubusb %%mm6, %%mm3 \n\t" | |
645 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
646 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
647 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
648 | |
7946 | 649 "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
3013 | 650 "movq %%mm6, %%mm5 \n\t" // l6 |
651 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
652 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
653 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
654 | |
7946 | 655 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
3013 | 656 "movq %%mm2, %%mm4 \n\t" // l5 |
657 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
658 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
659 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
660 | |
661 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
662 "psubusb %%mm2, %%mm6 \n\t" | |
663 "psubusb %%mm4, %%mm2 \n\t" | |
664 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
665 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
666 | |
667 | |
668 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
7946 | 669 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
4248 | 670 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
3013 | 671 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
672 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
673 "pand %%mm4, %%mm3 \n\t" | |
674 | |
675 "movq %%mm3, %%mm1 \n\t" | |
4248 | 676 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
3013 | 677 PAVGB(%%mm7, %%mm3) |
678 PAVGB(%%mm7, %%mm3) | |
679 "paddusb %%mm1, %%mm3 \n\t" | |
4248 | 680 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
3013 | 681 |
682 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |
683 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
684 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
685 "psubusb %%mm6, %%mm5 \n\t" | |
686 "psubusb %%mm4, %%mm6 \n\t" | |
687 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
688 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
689 "pxor %%mm6, %%mm0 \n\t" | |
690 "pand %%mm0, %%mm3 \n\t" | |
691 PMINUB(%%mm5, %%mm3, %%mm0) | |
692 | |
4248 | 693 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
3013 | 694 PAVGB(%%mm7, %%mm3) |
695 | |
696 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
697 "movq (%0, %1, 4), %%mm2 \n\t" | |
698 "pxor %%mm6, %%mm0 \n\t" | |
699 "pxor %%mm6, %%mm2 \n\t" | |
700 "psubb %%mm3, %%mm0 \n\t" | |
701 "paddb %%mm3, %%mm2 \n\t" | |
702 "pxor %%mm6, %%mm0 \n\t" | |
703 "pxor %%mm6, %%mm2 \n\t" | |
704 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
705 "movq %%mm2, (%0, %1, 4) \n\t" | |
706 #endif | |
707 | |
708 "leal (%0, %1), %%eax \n\t" | |
709 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
710 // 0 1 2 3 4 5 6 7 | |
7946 | 711 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
712 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
3013 | 713 |
714 | |
715 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |
716 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
717 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
718 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
719 // mm1=-l3-1, mm0=128-q | |
720 | |
721 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |
722 "movq (%%eax, %1), %%mm3 \n\t" // l2 | |
723 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
724 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
4248 | 725 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
7946 | 726 "leal (%%eax, %1, 4), %%ecx \n\t" |
3013 | 727 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
728 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
729 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
730 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
731 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
732 | |
733 "movq (%%eax), %%mm2 \n\t" // l1 | |
734 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
735 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
736 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
4248 | 737 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
3013 | 738 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
739 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
740 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
741 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
742 | |
7946 | 743 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
744 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 | |
3013 | 745 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
746 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
4248 | 747 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
3013 | 748 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
749 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
750 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
751 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
752 | |
4248 | 753 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
754 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
3013 | 755 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
756 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
757 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
758 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
759 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
760 | |
761 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
762 | |
4248 | 763 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
7946 | 764 "movq %2, %%mm2 \n\t" // QP |
3013 | 765 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
766 "psubb %%mm6, %%mm2 \n\t" | |
767 | |
768 "movq %%mm4, %%mm1 \n\t" | |
769 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
770 "pxor %%mm1, %%mm4 \n\t" | |
771 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
772 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
773 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
774 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
775 | |
776 "movq %%mm4, %%mm3 \n\t" // d | |
4248 | 777 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
3013 | 778 PAVGB(%%mm7, %%mm4) // d/32 |
779 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
780 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
781 "pand %%mm2, %%mm4 \n\t" | |
782 | |
4248 | 783 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
3013 | 784 "psubb %%mm0, %%mm5 \n\t" // q |
785 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
786 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
787 "pxor %%mm7, %%mm5 \n\t" | |
788 | |
789 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
790 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
791 | |
792 "pand %%mm7, %%mm4 \n\t" | |
793 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
794 "movq (%0, %1, 4), %%mm2 \n\t" | |
795 "pxor %%mm1, %%mm0 \n\t" | |
796 "pxor %%mm1, %%mm2 \n\t" | |
797 "paddb %%mm4, %%mm0 \n\t" | |
798 "psubb %%mm4, %%mm2 \n\t" | |
799 "pxor %%mm1, %%mm0 \n\t" | |
800 "pxor %%mm1, %%mm2 \n\t" | |
801 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
802 "movq %%mm2, (%0, %1, 4) \n\t" | |
803 | |
804 : | |
7946 | 805 : "r" (src), "r" (stride), "m" (c->pQPb) |
806 : "%eax", "%ecx" | |
3013 | 807 ); |
808 | |
809 /* | |
810 { | |
811 int x; | |
812 src-= stride; | |
813 for(x=0; x<BLOCK_SIZE; x++) | |
814 { | |
815 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
816 if(ABS(middleEnergy)< 8*QP) | |
817 { | |
818 const int q=(src[l4] - src[l5])/2; | |
819 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
820 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
821 | |
822 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
823 d= MAX(d, 0); | |
824 | |
825 d= (5*d + 32) >> 6; | |
826 d*= SIGN(-middleEnergy); | |
827 | |
828 if(q>0) | |
829 { | |
830 d= d<0 ? 0 : d; | |
831 d= d>q ? q : d; | |
832 } | |
833 else | |
834 { | |
835 d= d>0 ? 0 : d; | |
836 d= d<q ? q : d; | |
837 } | |
838 | |
839 src[l4]-= d; | |
840 src[l5]+= d; | |
841 } | |
842 src++; | |
843 } | |
844 src-=8; | |
845 for(x=0; x<8; x++) | |
846 { | |
847 int y; | |
848 for(y=4; y<6; y++) | |
849 { | |
850 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
851 int ad= ABS(d); | |
852 static int max=0; | |
853 static int sum=0; | |
854 static int num=0; | |
855 static int bias=0; | |
856 | |
857 if(max<ad) max=ad; | |
858 sum+= ad>3 ? 1 : 0; | |
859 if(ad>3) | |
860 { | |
861 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
862 } | |
863 if(y==4) bias+=d; | |
864 num++; | |
865 if(num%1000000 == 0) | |
866 { | |
867 printf(" %d %d %d %d\n", num, sum, max, bias); | |
868 } | |
869 } | |
870 } | |
871 } | |
872 */ | |
873 #elif defined (HAVE_MMX) | |
874 src+= stride*4; | |
875 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
876 asm volatile( |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
877 "pxor %%mm7, %%mm7 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
878 "leal (%0, %1), %%eax \n\t" |
7946 | 879 "leal (%%eax, %1, 4), %%edx \n\t" |
880 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars | |
881 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
882 // 0 1 2 3 4 5 6 7 |
7946 | 883 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
884 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
885 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
886 "movq (%0), %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
887 "movq %%mm0, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
888 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
889 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
890 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
891 "movq (%%eax), %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
892 "movq %%mm2, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
893 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
894 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
895 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
896 "movq (%%eax, %1), %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
897 "movq %%mm4, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
898 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
899 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
900 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
901 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
902 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
903 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
904 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
905 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
906 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
907 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
908 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
909 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
910 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
911 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
912 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
913 "movq (%%eax, %1, 2), %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
914 "movq %%mm2, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
915 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
916 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
917 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
918 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
919 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
7946 | 922 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
923 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
924 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
925 "movq (%0, %1, 4), %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
926 "movq %%mm0, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
927 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
928 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
929 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
930 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
931 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
7946 | 932 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
933 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
934 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
935 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
936 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
937 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
938 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
939 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
940 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
941 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
942 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
943 //50 opcodes so far |
7946 | 944 "movq (%%edx), %%mm2 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
945 "movq %%mm2, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
946 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
947 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
950 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
951 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
952 |
7946 | 953 "movq (%%edx, %1), %%mm6 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
7946 | 956 "movq (%%edx, %1), %%mm6 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
959 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
960 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
961 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
963 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
967 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 |
7946 | 970 "movq (%%edx, %1, 2), %%mm2 \n\t" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 "movq %%mm2, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
973 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
977 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 |
7946 | 980 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
981 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
2570 | 982 |
983 #ifdef HAVE_MMX2 | |
984 "movq %%mm7, %%mm6 \n\t" // 0 | |
985 "psubw %%mm0, %%mm6 \n\t" | |
986 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
987 "movq %%mm7, %%mm6 \n\t" // 0 | |
988 "psubw %%mm1, %%mm6 \n\t" | |
989 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
990 "movq %%mm7, %%mm6 \n\t" // 0 | |
991 "psubw %%mm2, %%mm6 \n\t" | |
992 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
993 "movq %%mm7, %%mm6 \n\t" // 0 | |
994 "psubw %%mm3, %%mm6 \n\t" | |
995 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
996 #else | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
997 "movq %%mm7, %%mm6 \n\t" // 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
998 "pcmpgtw %%mm0, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
999 "pxor %%mm6, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1000 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1001 "movq %%mm7, %%mm6 \n\t" // 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1002 "pcmpgtw %%mm1, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1003 "pxor %%mm6, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1004 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1005 "movq %%mm7, %%mm6 \n\t" // 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "pcmpgtw %%mm2, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "pxor %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1009 "movq %%mm7, %%mm6 \n\t" // 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1010 "pcmpgtw %%mm3, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1011 "pxor %%mm6, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1012 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
2570 | 1013 #endif |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1014 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1015 #ifdef HAVE_MMX2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1016 "pminsw %%mm2, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1017 "pminsw %%mm3, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1018 #else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1019 "movq %%mm0, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1020 "psubusw %%mm2, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1021 "psubw %%mm6, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1022 "movq %%mm1, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1023 "psubusw %%mm3, %%mm6 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1024 "psubw %%mm6, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1025 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "movq %%mm7, %%mm6 \n\t" // 0 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 "pxor %%mm6, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 "pxor %%mm7, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1034 // 100 opcodes |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1035 "movd %2, %%mm2 \n\t" // QP |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1036 "psllw $3, %%mm2 \n\t" // 8QP |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 "movq %%mm2, %%mm3 \n\t" // 8QP |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "pcmpgtw %%mm4, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "pcmpgtw %%mm5, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "pand %%mm2, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 "pand %%mm3, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1042 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 "psubusw %%mm0, %%mm4 \n\t" // hd |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 "psubusw %%mm1, %%mm5 \n\t" // ld |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 |
4253 | 1048 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 "pmullw %%mm2, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 "pmullw %%mm2, %%mm5 \n\t" |
4253 | 1051 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1052 "paddw %%mm2, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1053 "paddw %%mm2, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1054 "psrlw $6, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 "psrlw $6, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1056 |
7946 | 1057 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
1058 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1059 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1060 "pxor %%mm2, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1061 "pxor %%mm3, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1062 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1063 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1064 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1065 "pxor %%mm2, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1066 "pxor %%mm3, %%mm1 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1067 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1068 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1069 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1070 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1072 "pxor %%mm6, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1073 "pxor %%mm7, %%mm3 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 "pand %%mm2, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "pand %%mm3, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 #ifdef HAVE_MMX2 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1078 "pminsw %%mm0, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1079 "pminsw %%mm1, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 #else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "movq %%mm4, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1082 "psubusw %%mm0, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 "psubw %%mm2, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "movq %%mm5, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1085 "psubusw %%mm1, %%mm2 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 "psubw %%mm2, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1088 "pxor %%mm6, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1089 "pxor %%mm7, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 "psubw %%mm6, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "psubw %%mm7, %%mm5 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "packsswb %%mm5, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 "movq (%%eax, %1, 2), %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "paddb %%mm4, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "movq %%mm0, (%%eax, %1, 2) \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1096 "movq (%0, %1, 4), %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 "psubb %%mm4, %%mm0 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1098 "movq %%mm0, (%0, %1, 4) \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1099 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1100 : |
7946 | 1101 : "r" (src), "r" (stride), "m" (c->pQPb) |
1102 : "%eax", "%edx", "%ecx" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1103 ); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1104 #else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1105 const int l1= stride; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 const int l2= stride + l1; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 const int l3= stride + l2; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 const int l4= stride + l3; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1109 const int l5= stride + l4; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1110 const int l6= stride + l5; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 const int l7= stride + l6; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 const int l8= stride + l7; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 // const int l9= stride + l8; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
1114 int x; |
2246 | 1115 src+= stride*3; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
1116 for(x=0; x<BLOCK_SIZE; x++) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1117 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1118 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
7946 | 1119 if(ABS(middleEnergy) < 8*c->QP) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 const int q=(src[l4] - src[l5])/2; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1122 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1123 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1124 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1125 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1126 d= MAX(d, 0); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1127 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1128 d= (5*d + 32) >> 6; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1129 d*= SIGN(-middleEnergy); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1131 if(q>0) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1132 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1133 d= d<0 ? 0 : d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 d= d>q ? q : d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 d= d>0 ? 0 : d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 d= d<q ? q : d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1141 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1142 src[l4]-= d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 src[l5]+= d; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1145 src++; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1146 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1147 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 |
7946 | 1150 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 { |
2475 | 1152 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1153 asm volatile( |
7946 | 1154 "pxor %%mm6, %%mm6 \n\t" |
1155 "pcmpeqb %%mm7, %%mm7 \n\t" | |
1156 "movq %2, %%mm0 \n\t" | |
1157 "punpcklbw %%mm6, %%mm0 \n\t" | |
1158 "psrlw $1, %%mm0 \n\t" | |
1159 "psubw %%mm7, %%mm0 \n\t" | |
1160 "packuswb %%mm0, %%mm0 \n\t" | |
1161 "movq %%mm0, %3 \n\t" | |
2473 | 1162 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1163 "leal (%0, %1), %%eax \n\t" |
7946 | 1164 "leal (%%eax, %1, 4), %%edx \n\t" |
1165 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1166 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1167 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1168 |
3099 | 1169 #undef FIND_MIN_MAX |
2475 | 1170 #ifdef HAVE_MMX2 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1171 #define FIND_MIN_MAX(addr)\ |
2473 | 1172 "movq " #addr ", %%mm0 \n\t"\ |
3093 | 1173 "pminub %%mm0, %%mm7 \n\t"\ |
1174 "pmaxub %%mm0, %%mm6 \n\t" | |
2475 | 1175 #else |
1176 #define FIND_MIN_MAX(addr)\ | |
1177 "movq " #addr ", %%mm0 \n\t"\ | |
3093 | 1178 "movq %%mm7, %%mm1 \n\t"\ |
1179 "psubusb %%mm0, %%mm6 \n\t"\ | |
1180 "paddb %%mm0, %%mm6 \n\t"\ | |
2475 | 1181 "psubusb %%mm0, %%mm1 \n\t"\ |
3093 | 1182 "psubb %%mm1, %%mm7 \n\t" |
2475 | 1183 #endif |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1184 |
2473 | 1185 FIND_MIN_MAX((%%eax)) |
1186 FIND_MIN_MAX((%%eax, %1)) | |
1187 FIND_MIN_MAX((%%eax, %1, 2)) | |
1188 FIND_MIN_MAX((%0, %1, 4)) | |
7946 | 1189 FIND_MIN_MAX((%%edx)) |
1190 FIND_MIN_MAX((%%edx, %1)) | |
1191 FIND_MIN_MAX((%%edx, %1, 2)) | |
2473 | 1192 FIND_MIN_MAX((%0, %1, 8)) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1193 |
3093 | 1194 "movq %%mm7, %%mm4 \n\t" |
1195 "psrlq $8, %%mm7 \n\t" | |
1196 #ifdef HAVE_MMX2 | |
1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1198 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
1199 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1200 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
1201 "pminub %%mm4, %%mm7 \n\t" | |
1202 #else | |
1203 "movq %%mm7, %%mm1 \n\t" | |
1204 "psubusb %%mm4, %%mm1 \n\t" | |
1205 "psubb %%mm1, %%mm7 \n\t" | |
1206 "movq %%mm7, %%mm4 \n\t" | |
1207 "psrlq $16, %%mm7 \n\t" | |
1208 "movq %%mm7, %%mm1 \n\t" | |
1209 "psubusb %%mm4, %%mm1 \n\t" | |
1210 "psubb %%mm1, %%mm7 \n\t" | |
1211 "movq %%mm7, %%mm4 \n\t" | |
1212 "psrlq $32, %%mm7 \n\t" | |
1213 "movq %%mm7, %%mm1 \n\t" | |
1214 "psubusb %%mm4, %%mm1 \n\t" | |
1215 "psubb %%mm1, %%mm7 \n\t" | |
1216 #endif | |
1217 | |
1218 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1219 "movq %%mm6, %%mm4 \n\t" |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1220 "psrlq $8, %%mm6 \n\t" |
2475 | 1221 #ifdef HAVE_MMX2 |
3093 | 1222 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1223 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
3093 | 1224 "pmaxub %%mm4, %%mm6 \n\t" |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1225 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
3093 | 1226 "pmaxub %%mm4, %%mm6 \n\t" |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1227 #else |
3093 | 1228 "psubusb %%mm4, %%mm6 \n\t" |
1229 "paddb %%mm4, %%mm6 \n\t" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1230 "movq %%mm6, %%mm4 \n\t" |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1231 "psrlq $16, %%mm6 \n\t" |
3093 | 1232 "psubusb %%mm4, %%mm6 \n\t" |
1233 "paddb %%mm4, %%mm6 \n\t" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1234 "movq %%mm6, %%mm4 \n\t" |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1235 "psrlq $32, %%mm6 \n\t" |
3093 | 1236 "psubusb %%mm4, %%mm6 \n\t" |
1237 "paddb %%mm4, %%mm6 \n\t" | |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1238 #endif |
3093 | 1239 "movq %%mm6, %%mm0 \n\t" // max |
1240 "psubb %%mm7, %%mm6 \n\t" // max - min | |
1241 "movd %%mm6, %%ecx \n\t" | |
4248 | 1242 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
3093 | 1243 " jb 1f \n\t" |
7946 | 1244 "leal -24(%%esp), %%ecx \n\t" |
1245 "andl $0xFFFFFFF8, %%ecx \n\t" | |
3093 | 1246 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1247 "punpcklbw %%mm7, %%mm7 \n\t" |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1248 "punpcklbw %%mm7, %%mm7 \n\t" |
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1249 "punpcklbw %%mm7, %%mm7 \n\t" |
7946 | 1250 "movq %%mm7, (%%ecx) \n\t" |
2473 | 1251 |
1252 "movq (%0), %%mm0 \n\t" // L10 | |
1253 "movq %%mm0, %%mm1 \n\t" // L10 | |
1254 "movq %%mm0, %%mm2 \n\t" // L10 | |
1255 "psllq $8, %%mm1 \n\t" | |
1256 "psrlq $8, %%mm2 \n\t" | |
1257 "movd -4(%0), %%mm3 \n\t" | |
1258 "movd 8(%0), %%mm4 \n\t" | |
1259 "psrlq $24, %%mm3 \n\t" | |
1260 "psllq $56, %%mm4 \n\t" | |
1261 "por %%mm3, %%mm1 \n\t" // L00 | |
1262 "por %%mm4, %%mm2 \n\t" // L20 | |
1263 "movq %%mm1, %%mm3 \n\t" // L00 | |
1264 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
1265 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
1266 "psubusb %%mm7, %%mm0 \n\t" | |
1267 "psubusb %%mm7, %%mm2 \n\t" | |
1268 "psubusb %%mm7, %%mm3 \n\t" | |
4248 | 1269 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
1270 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
1271 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
2473 | 1272 "paddb %%mm2, %%mm0 \n\t" |
1273 "paddb %%mm3, %%mm0 \n\t" | |
1274 | |
1275 "movq (%%eax), %%mm2 \n\t" // L11 | |
1276 "movq %%mm2, %%mm3 \n\t" // L11 | |
1277 "movq %%mm2, %%mm4 \n\t" // L11 | |
1278 "psllq $8, %%mm3 \n\t" | |
1279 "psrlq $8, %%mm4 \n\t" | |
1280 "movd -4(%%eax), %%mm5 \n\t" | |
1281 "movd 8(%%eax), %%mm6 \n\t" | |
1282 "psrlq $24, %%mm5 \n\t" | |
1283 "psllq $56, %%mm6 \n\t" | |
1284 "por %%mm5, %%mm3 \n\t" // L01 | |
1285 "por %%mm6, %%mm4 \n\t" // L21 | |
1286 "movq %%mm3, %%mm5 \n\t" // L01 | |
1287 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
1288 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
1289 "psubusb %%mm7, %%mm2 \n\t" | |
1290 "psubusb %%mm7, %%mm4 \n\t" | |
1291 "psubusb %%mm7, %%mm5 \n\t" | |
4248 | 1292 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
1293 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
1294 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
2473 | 1295 "paddb %%mm4, %%mm2 \n\t" |
1296 "paddb %%mm5, %%mm2 \n\t" | |
1297 // 0, 2, 3, 1 | |
1298 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
1299 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
1300 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
1301 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
1302 "psllq $8, " #lx " \n\t"\ | |
1303 "psrlq $8, " #t0 " \n\t"\ | |
1304 "movd -4" #src ", " #t1 " \n\t"\ | |
1305 "psrlq $24, " #t1 " \n\t"\ | |
1306 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
1307 "movd 8" #src ", " #t1 " \n\t"\ | |
1308 "psllq $56, " #t1 " \n\t"\ | |
1309 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
1310 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
1311 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
1312 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
2478 | 1313 PAVGB(lx, pplx) \ |
7946 | 1314 "movq " #lx ", 8(%%ecx) \n\t"\ |
1315 "movq (%%ecx), " #lx " \n\t"\ | |
2570 | 1316 "psubusb " #lx ", " #t1 " \n\t"\ |
1317 "psubusb " #lx ", " #t0 " \n\t"\ | |
1318 "psubusb " #lx ", " #sx " \n\t"\ | |
4248 | 1319 "movq "MANGLE(b00)", " #lx " \n\t"\ |
2570 | 1320 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
1321 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
1322 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
2473 | 1323 "paddb " #t1 ", " #t0 " \n\t"\ |
1324 "paddb " #t0 ", " #sx " \n\t"\ | |
1325 \ | |
1326 PAVGB(plx, pplx) /* filtered */\ | |
1327 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
2477 | 1328 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
7946 | 1329 "psubusb %3, " #t0 " \n\t"\ |
1330 "paddusb %3, " #t1 " \n\t"\ | |
2477 | 1331 PMAXUB(t0, pplx)\ |
1332 PMINUB(t1, pplx, t0)\ | |
2473 | 1333 "paddb " #sx ", " #ppsx " \n\t"\ |
1334 "paddb " #psx ", " #ppsx " \n\t"\ | |
4248 | 1335 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
1336 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
2570 | 1337 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
2477 | 1338 "pand " #ppsx ", " #pplx " \n\t"\ |
2473 | 1339 "pandn " #dst ", " #ppsx " \n\t"\ |
2570 | 1340 "por " #pplx ", " #ppsx " \n\t"\ |
2478 | 1341 "movq " #ppsx ", " #dst " \n\t"\ |
7946 | 1342 "movq 8(%%ecx), " #lx " \n\t" |
2477 | 1343 |
2473 | 1344 /* |
1345 0000000 | |
1346 1111111 | |
1347 | |
1348 1111110 | |
1349 1111101 | |
1350 1111100 | |
1351 1111011 | |
1352 1111010 | |
1353 1111001 | |
1354 | |
1355 1111000 | |
1356 1110111 | |
1357 | |
1358 */ | |
1359 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
1360 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
1361 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
1362 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
7946 | 1363 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
1364 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
1365 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
1366 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
1367 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1368 |
3093 | 1369 "1: \n\t" |
7946 | 1370 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
1371 : "%eax", "%edx", "%ecx" | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1372 ); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1373 #else |
2477 | 1374 int y; |
1375 int min=255; | |
1376 int max=0; | |
1377 int avg; | |
1378 uint8_t *p; | |
1379 int s[10]; | |
7946 | 1380 const int QP2= c->QP/2 + 1; |
2477 | 1381 |
1382 for(y=1; y<9; y++) | |
1383 { | |
1384 int x; | |
1385 p= src + stride*y; | |
1386 for(x=1; x<9; x++) | |
1387 { | |
1388 p++; | |
1389 if(*p > max) max= *p; | |
1390 if(*p < min) min= *p; | |
1391 } | |
1392 } | |
7946 | 1393 avg= (min + max + 1)>>1; |
2477 | 1394 |
3093 | 1395 if(max - min <deringThreshold) return; |
1396 | |
2477 | 1397 for(y=0; y<10; y++) |
1398 { | |
1399 int t = 0; | |
7946 | 1400 |
1401 if(src[stride*y + 0] > avg) t+= 1; | |
1402 if(src[stride*y + 1] > avg) t+= 2; | |
1403 if(src[stride*y + 2] > avg) t+= 4; | |
1404 if(src[stride*y + 3] > avg) t+= 8; | |
1405 if(src[stride*y + 4] > avg) t+= 16; | |
1406 if(src[stride*y + 5] > avg) t+= 32; | |
1407 if(src[stride*y + 6] > avg) t+= 64; | |
1408 if(src[stride*y + 7] > avg) t+= 128; | |
1409 if(src[stride*y + 8] > avg) t+= 256; | |
1410 if(src[stride*y + 9] > avg) t+= 512; | |
1411 | |
2477 | 1412 t |= (~t)<<16; |
1413 t &= (t<<1) & (t>>1); | |
1414 s[y] = t; | |
1415 } | |
7946 | 1416 |
1417 for(y=1; y<9; y++) | |
1418 { | |
1419 int t = s[y-1] & s[y] & s[y+1]; | |
1420 t|= t>>16; | |
1421 s[y-1]= t; | |
1422 } | |
2477 | 1423 |
1424 for(y=1; y<9; y++) | |
1425 { | |
1426 int x; | |
7946 | 1427 int t = s[y-1]; |
2477 | 1428 |
1429 p= src + stride*y; | |
1430 for(x=1; x<9; x++) | |
1431 { | |
1432 p++; | |
1433 if(t & (1<<x)) | |
1434 { | |
1435 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
1436 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
1437 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
1438 f= (f + 8)>>4; | |
1439 | |
3093 | 1440 #ifdef DEBUG_DERING_THRESHOLD |
1441 asm volatile("emms\n\t":); | |
1442 { | |
1443 static long long numPixels=0; | |
1444 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
1445 // if((max-min)<20 || (max-min)*QP<200) | |
1446 // if((max-min)*QP < 500) | |
1447 // if(max-min<QP/2) | |
1448 if(max-min < 20) | |
1449 { | |
1450 static int numSkiped=0; | |
1451 static int errorSum=0; | |
1452 static int worstQP=0; | |
1453 static int worstRange=0; | |
1454 static int worstDiff=0; | |
1455 int diff= (f - *p); | |
1456 int absDiff= ABS(diff); | |
1457 int error= diff*diff; | |
1458 | |
1459 if(x==1 || x==8 || y==1 || y==8) continue; | |
1460 | |
1461 numSkiped++; | |
1462 if(absDiff > worstDiff) | |
1463 { | |
1464 worstDiff= absDiff; | |
1465 worstQP= QP; | |
1466 worstRange= max-min; | |
1467 } | |
1468 errorSum+= error; | |
1469 | |
1470 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
1471 { | |
1472 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
1473 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
1474 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
1475 worstDiff, (float)numSkiped/numPixels); | |
1476 } | |
1477 } | |
1478 } | |
1479 #endif | |
7946 | 1480 if (*p + QP2 < f) *p= *p + QP2; |
1481 else if(*p - QP2 > f) *p= *p - QP2; | |
2477 | 1482 else *p=f; |
1483 } | |
1484 } | |
1485 } | |
3093 | 1486 #ifdef DEBUG_DERING_THRESHOLD |
1487 if(max-min < 20) | |
1488 { | |
1489 for(y=1; y<9; y++) | |
1490 { | |
1491 int x; | |
1492 int t = 0; | |
1493 p= src + stride*y; | |
1494 for(x=1; x<9; x++) | |
1495 { | |
1496 p++; | |
1497 *p = MIN(*p + 20, 255); | |
1498 } | |
1499 } | |
1500 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
1501 } | |
1502 #endif | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1503 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1504 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1505 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1506 /** |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1507 * Deinterlaces the given block |
2595 | 1508 * will be called for every 8x8 block and can read & write from line 4-15 |
1509 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1510 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1511 */ |
3099 | 1512 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1513 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1514 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2595 | 1515 src+= 4*stride; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1516 asm volatile( |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1517 "leal (%0, %1), %%eax \n\t" |
7946 | 1518 "leal (%%eax, %1, 4), %%ecx \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1519 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1520 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1521 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1522 "movq (%0), %%mm0 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1523 "movq (%%eax, %1), %%mm1 \n\t" |
2246 | 1524 PAVGB(%%mm1, %%mm0) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1525 "movq %%mm0, (%%eax) \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1526 "movq (%0, %1, 4), %%mm0 \n\t" |
2246 | 1527 PAVGB(%%mm0, %%mm1) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1528 "movq %%mm1, (%%eax, %1, 2) \n\t" |
7946 | 1529 "movq (%%ecx, %1), %%mm1 \n\t" |
2246 | 1530 PAVGB(%%mm1, %%mm0) |
7946 | 1531 "movq %%mm0, (%%ecx) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1532 "movq (%0, %1, 8), %%mm0 \n\t" |
2246 | 1533 PAVGB(%%mm0, %%mm1) |
7946 | 1534 "movq %%mm1, (%%ecx, %1, 2) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1535 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1536 : : "r" (src), "r" (stride) |
7946 | 1537 : "%eax", "%ecx" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1538 ); |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1539 #else |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1540 int x; |
2595 | 1541 src+= 4*stride; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1542 for(x=0; x<8; x++) |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1543 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1544 src[stride] = (src[0] + src[stride*2])>>1; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1545 src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1546 src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1547 src[stride*7] = (src[stride*6] + src[stride*8])>>1; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1548 src++; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1549 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1550 #endif |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1551 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1552 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1553 /** |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1554 * Deinterlaces the given block |
2595 | 1555 * will be called for every 8x8 block and can read & write from line 4-15 |
1556 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1557 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1558 * this filter will read lines 3-15 and write 7-13 | |
2246 | 1559 * no cliping in C version |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1560 */ |
3099 | 1561 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1562 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1563 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2595 | 1564 src+= stride*3; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1565 asm volatile( |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1566 "leal (%0, %1), %%eax \n\t" |
7946 | 1567 "leal (%%eax, %1, 4), %%edx \n\t" |
1568 "leal (%%edx, %1, 4), %%ecx \n\t" | |
2246 | 1569 "addl %1, %%ecx \n\t" |
1570 "pxor %%mm7, %%mm7 \n\t" | |
1571 // 0 1 2 3 4 5 6 7 8 9 10 | |
7946 | 1572 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1573 |
2246 | 1574 #define DEINT_CUBIC(a,b,c,d,e)\ |
1575 "movq " #a ", %%mm0 \n\t"\ | |
1576 "movq " #b ", %%mm1 \n\t"\ | |
1577 "movq " #d ", %%mm2 \n\t"\ | |
1578 "movq " #e ", %%mm3 \n\t"\ | |
1579 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
1580 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ | |
1581 "movq %%mm0, %%mm2 \n\t"\ | |
1582 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1583 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
1584 "movq %%mm1, %%mm3 \n\t"\ | |
1585 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1586 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1587 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
1588 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
1589 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
1590 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
1591 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
1592 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
1593 "packuswb %%mm3, %%mm1 \n\t"\ | |
1594 "movq %%mm1, " #c " \n\t" | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1595 |
7946 | 1596 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
1597 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) | |
1598 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) | |
1599 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1600 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1601 : : "r" (src), "r" (stride) |
7946 | 1602 : "%eax", "%edx", "ecx" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1603 ); |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1604 #else |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1605 int x; |
2595 | 1606 src+= stride*3; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1607 for(x=0; x<8; x++) |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1608 { |
2246 | 1609 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
1610 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; | |
1611 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; | |
1612 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1613 src++; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1614 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1615 #endif |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1616 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1617 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1618 /** |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1619 * Deinterlaces the given block |
2595 | 1620 * will be called for every 8x8 block and can read & write from line 4-15 |
1621 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1622 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
7946 | 1623 * this filter will read lines 4-13 and write 5-11 |
1624 * no cliping in C version | |
1625 */ | |
1626 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
1627 { | |
1628 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1629 src+= stride*4; | |
1630 asm volatile( | |
1631 "leal (%0, %1), %%eax \n\t" | |
1632 "leal (%%eax, %1, 4), %%edx \n\t" | |
1633 "pxor %%mm7, %%mm7 \n\t" | |
1634 "movq (%2), %%mm0 \n\t" | |
1635 // 0 1 2 3 4 5 6 7 8 9 10 | |
1636 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
1637 | |
1638 #define DEINT_FF(a,b,c,d)\ | |
1639 "movq " #a ", %%mm1 \n\t"\ | |
1640 "movq " #b ", %%mm2 \n\t"\ | |
1641 "movq " #c ", %%mm3 \n\t"\ | |
1642 "movq " #d ", %%mm4 \n\t"\ | |
1643 PAVGB(%%mm3, %%mm1) \ | |
1644 PAVGB(%%mm4, %%mm0) \ | |
1645 "movq %%mm0, %%mm3 \n\t"\ | |
1646 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1647 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1648 "movq %%mm1, %%mm4 \n\t"\ | |
1649 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1650 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
1651 "psllw $2, %%mm1 \n\t"\ | |
1652 "psllw $2, %%mm4 \n\t"\ | |
1653 "psubw %%mm0, %%mm1 \n\t"\ | |
1654 "psubw %%mm3, %%mm4 \n\t"\ | |
1655 "movq %%mm2, %%mm5 \n\t"\ | |
1656 "movq %%mm2, %%mm0 \n\t"\ | |
1657 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1658 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1659 "paddw %%mm2, %%mm1 \n\t"\ | |
1660 "paddw %%mm5, %%mm4 \n\t"\ | |
1661 "psraw $2, %%mm1 \n\t"\ | |
1662 "psraw $2, %%mm4 \n\t"\ | |
1663 "packuswb %%mm4, %%mm1 \n\t"\ | |
1664 "movq %%mm1, " #b " \n\t"\ | |
1665 | |
1666 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) | |
1667 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) | |
1668 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) | |
1669 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) | |
1670 | |
1671 "movq %%mm0, (%2) \n\t" | |
1672 : : "r" (src), "r" (stride), "r"(tmp) | |
1673 : "%eax", "%edx" | |
1674 ); | |
1675 #else | |
1676 int x; | |
1677 src+= stride*4; | |
1678 for(x=0; x<8; x++) | |
1679 { | |
1680 int t1= tmp[x]; | |
1681 int t2= src[stride*1]; | |
1682 | |
1683 src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3; | |
1684 t1= src[stride*4]; | |
1685 src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3; | |
1686 t2= src[stride*6]; | |
1687 src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3; | |
1688 t1= src[stride*8]; | |
1689 src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3; | |
1690 tmp[x]= t1; | |
1691 | |
1692 src++; | |
1693 } | |
1694 #endif | |
1695 } | |
1696 | |
1697 /** | |
1698 * Deinterlaces the given block | |
1699 * will be called for every 8x8 block and can read & write from line 4-15 | |
1700 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1701 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1702 * will shift the image up by 1 line (FIXME if this is a problem) |
2595 | 1703 * this filter will read lines 4-13 and write 4-11 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1704 */ |
3099 | 1705 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1706 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2595 | 1708 src+= 4*stride; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1709 asm volatile( |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1710 "leal (%0, %1), %%eax \n\t" |
7946 | 1711 "leal (%%eax, %1, 4), %%edx \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1712 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1713 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1714 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1715 "movq (%0), %%mm0 \n\t" // L0 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1716 "movq (%%eax, %1), %%mm1 \n\t" // L2 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1717 PAVGB(%%mm1, %%mm0) // L0+L2 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1718 "movq (%%eax), %%mm2 \n\t" // L1 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1719 PAVGB(%%mm2, %%mm0) |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1720 "movq %%mm0, (%0) \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1721 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1722 PAVGB(%%mm0, %%mm2) // L1+L3 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1723 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1724 "movq %%mm2, (%%eax) \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1725 "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1726 PAVGB(%%mm2, %%mm1) // L2+L4 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1727 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1728 "movq %%mm1, (%%eax, %1) \n\t" |
7946 | 1729 "movq (%%edx), %%mm1 \n\t" // L5 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1730 PAVGB(%%mm1, %%mm0) // L3+L5 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1731 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1732 "movq %%mm0, (%%eax, %1, 2) \n\t" |
7946 | 1733 "movq (%%edx, %1), %%mm0 \n\t" // L6 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1734 PAVGB(%%mm0, %%mm2) // L4+L6 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1735 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1736 "movq %%mm2, (%0, %1, 4) \n\t" |
7946 | 1737 "movq (%%edx, %1, 2), %%mm2 \n\t" // L7 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1738 PAVGB(%%mm2, %%mm1) // L5+L7 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1739 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
7946 | 1740 "movq %%mm1, (%%edx) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1741 "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1742 PAVGB(%%mm1, %%mm0) // L6+L8 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1743 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
7946 | 1744 "movq %%mm0, (%%edx, %1) \n\t" |
1745 "movq (%%edx, %1, 4), %%mm0 \n\t" // L9 | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1746 PAVGB(%%mm0, %%mm2) // L7+L9 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1747 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
7946 | 1748 "movq %%mm2, (%%edx, %1, 2) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1749 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1750 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1751 : : "r" (src), "r" (stride) |
7946 | 1752 : "%eax", "%edx" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1753 ); |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1754 #else |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1755 int x; |
2595 | 1756 src+= 4*stride; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1757 for(x=0; x<8; x++) |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1758 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1759 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1760 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1761 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1762 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1763 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1764 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1765 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1766 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1767 src++; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1768 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1769 #endif |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1770 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1771 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1772 /** |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1773 * Deinterlaces the given block |
2595 | 1774 * will be called for every 8x8 block and can read & write from line 4-15, |
1775 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1776 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1777 */ |
3099 | 1778 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1779 { |
2221 | 1780 #ifdef HAVE_MMX |
2595 | 1781 src+= 4*stride; |
2221 | 1782 #ifdef HAVE_MMX2 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1783 asm volatile( |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1784 "leal (%0, %1), %%eax \n\t" |
7946 | 1785 "leal (%%eax, %1, 4), %%edx \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1786 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1787 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1788 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1789 "movq (%0), %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1790 "movq (%%eax, %1), %%mm2 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1791 "movq (%%eax), %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1792 "movq %%mm0, %%mm3 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1793 "pmaxub %%mm1, %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1794 "pminub %%mm3, %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1795 "pmaxub %%mm2, %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1796 "pminub %%mm1, %%mm0 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1797 "movq %%mm0, (%%eax) \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1798 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1799 "movq (%0, %1, 4), %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1800 "movq (%%eax, %1, 2), %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1801 "movq %%mm2, %%mm3 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1802 "pmaxub %%mm1, %%mm2 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1803 "pminub %%mm3, %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1804 "pmaxub %%mm0, %%mm1 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1805 "pminub %%mm1, %%mm2 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1806 "movq %%mm2, (%%eax, %1, 2) \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1807 |
7946 | 1808 "movq (%%edx), %%mm2 \n\t" // |
1809 "movq (%%edx, %1), %%mm1 \n\t" // | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1810 "movq %%mm2, %%mm3 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1811 "pmaxub %%mm0, %%mm2 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1812 "pminub %%mm3, %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1813 "pmaxub %%mm1, %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1814 "pminub %%mm0, %%mm2 \n\t" |
7946 | 1815 "movq %%mm2, (%%edx) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1816 |
7946 | 1817 "movq (%%edx, %1, 2), %%mm2 \n\t" // |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1818 "movq (%0, %1, 8), %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1819 "movq %%mm2, %%mm3 \n\t" |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1820 "pmaxub %%mm0, %%mm2 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1821 "pminub %%mm3, %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1822 "pmaxub %%mm1, %%mm0 \n\t" // |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1823 "pminub %%mm0, %%mm2 \n\t" |
7946 | 1824 "movq %%mm2, (%%edx, %1, 2) \n\t" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1825 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1826 |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1827 : : "r" (src), "r" (stride) |
7946 | 1828 : "%eax", "%edx" |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1829 ); |
2221 | 1830 |
1831 #else // MMX without MMX2 | |
1832 asm volatile( | |
1833 "leal (%0, %1), %%eax \n\t" | |
7946 | 1834 "leal (%%eax, %1, 4), %%edx \n\t" |
2221 | 1835 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1836 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2221 | 1837 "pxor %%mm7, %%mm7 \n\t" |
1838 | |
1839 #define MEDIAN(a,b,c)\ | |
1840 "movq " #a ", %%mm0 \n\t"\ | |
1841 "movq " #b ", %%mm2 \n\t"\ | |
1842 "movq " #c ", %%mm1 \n\t"\ | |
1843 "movq %%mm0, %%mm3 \n\t"\ | |
1844 "movq %%mm1, %%mm4 \n\t"\ | |
1845 "movq %%mm2, %%mm5 \n\t"\ | |
1846 "psubusb %%mm1, %%mm3 \n\t"\ | |
1847 "psubusb %%mm2, %%mm4 \n\t"\ | |
1848 "psubusb %%mm0, %%mm5 \n\t"\ | |
1849 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
1850 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
1851 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
1852 "movq %%mm3, %%mm6 \n\t"\ | |
1853 "pxor %%mm4, %%mm3 \n\t"\ | |
1854 "pxor %%mm5, %%mm4 \n\t"\ | |
1855 "pxor %%mm6, %%mm5 \n\t"\ | |
1856 "por %%mm3, %%mm1 \n\t"\ | |
1857 "por %%mm4, %%mm2 \n\t"\ | |
1858 "por %%mm5, %%mm0 \n\t"\ | |
1859 "pand %%mm2, %%mm0 \n\t"\ | |
1860 "pand %%mm1, %%mm0 \n\t"\ | |
1861 "movq %%mm0, " #b " \n\t" | |
1862 | |
1863 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
1864 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
7946 | 1865 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
1866 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) | |
2221 | 1867 |
1868 : : "r" (src), "r" (stride) | |
7946 | 1869 : "%eax", "%edx" |
2221 | 1870 ); |
1871 #endif // MMX | |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1872 #else |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1873 //FIXME |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1874 int x; |
2595 | 1875 src+= 4*stride; |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1876 for(x=0; x<8; x++) |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1877 { |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1878 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1879 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1880 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1881 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1882 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1883 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1884 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1885 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1886 src++; |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1887 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1888 #endif |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1889 } |
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
1890 |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
1891 #ifdef HAVE_MMX |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1892 /** |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1893 * transposes and shift the given 8x8 Block into dst1 and dst2 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1894 */ |
3099 | 1895 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1896 { |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1897 asm( |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1898 "leal (%0, %1), %%eax \n\t" |
7946 | 1899 "leal (%%eax, %1, 4), %%edx \n\t" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1900 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1901 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1902 "movq (%0), %%mm0 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1903 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1904 "movq %%mm0, %%mm2 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1905 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1906 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1907 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1908 "movq (%%eax, %1), %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1909 "movq (%%eax, %1, 2), %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1910 "movq %%mm1, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1911 "punpcklbw %%mm3, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1912 "punpckhbw %%mm3, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1913 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1914 "movq %%mm0, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1915 "punpcklwd %%mm1, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1916 "punpckhwd %%mm1, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1917 "movq %%mm2, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1918 "punpcklwd %%mm4, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1919 "punpckhwd %%mm4, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1920 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1921 "movd %%mm0, 128(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1922 "psrlq $32, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1923 "movd %%mm0, 144(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1924 "movd %%mm3, 160(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1925 "psrlq $32, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1926 "movd %%mm3, 176(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1927 "movd %%mm3, 48(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1928 "movd %%mm2, 192(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1929 "movd %%mm2, 64(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1930 "psrlq $32, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1931 "movd %%mm2, 80(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1932 "movd %%mm1, 96(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1933 "psrlq $32, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1934 "movd %%mm1, 112(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1935 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1936 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
7946 | 1937 "movq (%%edx), %%mm1 \n\t" // abcdefgh |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1938 "movq %%mm0, %%mm2 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1939 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1940 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1941 |
7946 | 1942 "movq (%%edx, %1), %%mm1 \n\t" |
1943 "movq (%%edx, %1, 2), %%mm3 \n\t" | |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1944 "movq %%mm1, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1945 "punpcklbw %%mm3, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1946 "punpckhbw %%mm3, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1947 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1948 "movq %%mm0, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1949 "punpcklwd %%mm1, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1950 "punpckhwd %%mm1, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1951 "movq %%mm2, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1952 "punpcklwd %%mm4, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1953 "punpckhwd %%mm4, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1954 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1955 "movd %%mm0, 132(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1956 "psrlq $32, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1957 "movd %%mm0, 148(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1958 "movd %%mm3, 164(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1959 "psrlq $32, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1960 "movd %%mm3, 180(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1961 "movd %%mm3, 52(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1962 "movd %%mm2, 196(%2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1963 "movd %%mm2, 68(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1964 "psrlq $32, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1965 "movd %%mm2, 84(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1966 "movd %%mm1, 100(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1967 "psrlq $32, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1968 "movd %%mm1, 116(%3) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1969 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1970 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1971 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
7946 | 1972 : "%eax", "%edx" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1973 ); |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1974 } |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1975 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1976 /** |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1977 * transposes the given 8x8 block |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1978 */ |
3099 | 1979 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1980 { |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1981 asm( |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1982 "leal (%0, %1), %%eax \n\t" |
7946 | 1983 "leal (%%eax, %1, 4), %%edx \n\t" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1984 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 1985 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1986 "movq (%2), %%mm0 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1987 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1988 "movq %%mm0, %%mm2 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1989 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1990 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1991 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1992 "movq 32(%2), %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1993 "movq 48(%2), %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1994 "movq %%mm1, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1995 "punpcklbw %%mm3, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1996 "punpckhbw %%mm3, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1997 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1998 "movq %%mm0, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
1999 "punpcklwd %%mm1, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2000 "punpckhwd %%mm1, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2001 "movq %%mm2, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2002 "punpcklwd %%mm4, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2003 "punpckhwd %%mm4, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2004 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2005 "movd %%mm0, (%0) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2006 "psrlq $32, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2007 "movd %%mm0, (%%eax) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2008 "movd %%mm3, (%%eax, %1) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2009 "psrlq $32, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2010 "movd %%mm3, (%%eax, %1, 2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2011 "movd %%mm2, (%0, %1, 4) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2012 "psrlq $32, %%mm2 \n\t" |
7946 | 2013 "movd %%mm2, (%%edx) \n\t" |
2014 "movd %%mm1, (%%edx, %1) \n\t" | |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2015 "psrlq $32, %%mm1 \n\t" |
7946 | 2016 "movd %%mm1, (%%edx, %1, 2) \n\t" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2017 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2018 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2019 "movq 64(%2), %%mm0 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2020 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2021 "movq %%mm0, %%mm2 \n\t" // 12345678 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2022 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2023 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2024 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2025 "movq 96(%2), %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2026 "movq 112(%2), %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2027 "movq %%mm1, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2028 "punpcklbw %%mm3, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2029 "punpckhbw %%mm3, %%mm4 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2030 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2031 "movq %%mm0, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2032 "punpcklwd %%mm1, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2033 "punpckhwd %%mm1, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2034 "movq %%mm2, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2035 "punpcklwd %%mm4, %%mm2 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2036 "punpckhwd %%mm4, %%mm1 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2037 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2038 "movd %%mm0, 4(%0) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2039 "psrlq $32, %%mm0 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2040 "movd %%mm0, 4(%%eax) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2041 "movd %%mm3, 4(%%eax, %1) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2042 "psrlq $32, %%mm3 \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2043 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2044 "movd %%mm2, 4(%0, %1, 4) \n\t" |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2045 "psrlq $32, %%mm2 \n\t" |
7946 | 2046 "movd %%mm2, 4(%%edx) \n\t" |
2047 "movd %%mm1, 4(%%edx, %1) \n\t" | |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2048 "psrlq $32, %%mm1 \n\t" |
7946 | 2049 "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2050 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2051 :: "r" (dst), "r" (dstStride), "r" (src) |
7946 | 2052 : "%eax", "%edx" |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2053 ); |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2054 } |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
2055 #endif |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2056 //static int test=0; |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2057 |
3099 | 2058 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
2899 | 2059 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
2860 | 2060 { |
7946 | 2061 // to save a register (FIXME do this outside of the loops) |
2062 tempBluredPast[127]= maxNoise[0]; | |
2063 tempBluredPast[128]= maxNoise[1]; | |
2064 tempBluredPast[129]= maxNoise[2]; | |
2065 | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2066 #define FAST_L2_DIFF |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2067 //#define L1_DIFF //u should change the thresholds too if u try that one |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2068 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2069 asm volatile( |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2070 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
7946 | 2071 "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
2072 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2073 // 0 1 2 3 4 5 6 7 8 9 |
7946 | 2074 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2075 //FIXME reorder? |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2076 #ifdef L1_DIFF //needs mmx2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2077 "movq (%0), %%mm0 \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2078 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2079 "movq (%0, %2), %%mm1 \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2080 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2081 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2082 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2083 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2084 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2085 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2086 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2087 "paddw %%mm1, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2088 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
7946 | 2089 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2090 "paddw %%mm2, %%mm0 \n\t" |
7946 | 2091 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2092 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2093 "paddw %%mm3, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2094 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2095 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2096 "paddw %%mm4, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2097 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2098 "paddw %%mm5, %%mm6 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2099 "paddw %%mm7, %%mm6 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2100 "paddw %%mm6, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2101 #elif defined (FAST_L2_DIFF) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2102 "pcmpeqb %%mm7, %%mm7 \n\t" |
4248 | 2103 "movq "MANGLE(b80)", %%mm6 \n\t" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2104 "pxor %%mm0, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2105 #define L2_DIFF_CORE(a, b)\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2106 "movq " #a ", %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2107 "movq " #b ", %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2108 "pxor %%mm7, %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2109 PAVGB(%%mm2, %%mm5)\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2110 "paddb %%mm6, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2111 "movq %%mm5, %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2112 "psllw $8, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2113 "pmaddwd %%mm5, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2114 "pmaddwd %%mm2, %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2115 "paddd %%mm2, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2116 "psrld $14, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2117 "paddd %%mm5, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2118 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2119 L2_DIFF_CORE((%0), (%1)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2120 L2_DIFF_CORE((%0, %2), (%1, %2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2121 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2122 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2123 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
7946 | 2124 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2125 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2126 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2127 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2128 #else |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2129 "pxor %%mm7, %%mm7 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2130 "pxor %%mm0, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2131 #define L2_DIFF_CORE(a, b)\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2132 "movq " #a ", %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2133 "movq " #b ", %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2134 "movq %%mm5, %%mm1 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2135 "movq %%mm2, %%mm3 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2136 "punpcklbw %%mm7, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2137 "punpckhbw %%mm7, %%mm1 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2138 "punpcklbw %%mm7, %%mm2 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2139 "punpckhbw %%mm7, %%mm3 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2140 "psubw %%mm2, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2141 "psubw %%mm3, %%mm1 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2142 "pmaddwd %%mm5, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2143 "pmaddwd %%mm1, %%mm1 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2144 "paddd %%mm1, %%mm5 \n\t"\ |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2145 "paddd %%mm5, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2146 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2147 L2_DIFF_CORE((%0), (%1)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2148 L2_DIFF_CORE((%0, %2), (%1, %2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2149 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2150 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2151 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
7946 | 2152 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2153 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2154 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2155 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2156 #endif |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2157 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2158 "movq %%mm0, %%mm4 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2159 "psrlq $32, %%mm0 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2160 "paddd %%mm0, %%mm4 \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2161 "movd %%mm4, %%ecx \n\t" |
2899 | 2162 "shll $2, %%ecx \n\t" |
7946 | 2163 "movl %3, %%edx \n\t" |
2164 "addl -4(%%edx), %%ecx \n\t" | |
2165 "addl 4(%%edx), %%ecx \n\t" | |
2166 "addl -1024(%%edx), %%ecx \n\t" | |
2899 | 2167 "addl $4, %%ecx \n\t" |
7946 | 2168 "addl 1024(%%edx), %%ecx \n\t" |
2899 | 2169 "shrl $3, %%ecx \n\t" |
7946 | 2170 "movl %%ecx, (%%edx) \n\t" |
2899 | 2171 |
4248 | 2172 // "movl %3, %%ecx \n\t" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2173 // "movl %%ecx, test \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2174 // "jmp 4f \n\t" |
7946 | 2175 "cmpl 512(%%edx), %%ecx \n\t" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2176 " jb 2f \n\t" |
7946 | 2177 "cmpl 516(%%edx), %%ecx \n\t" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2178 " jb 1f \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2179 |
7946 | 2180 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2181 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2182 "movq (%0), %%mm0 \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2183 "movq (%0, %2), %%mm1 \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2184 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2185 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2186 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
7946 | 2187 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2188 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2189 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2190 "movq %%mm0, (%1) \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2191 "movq %%mm1, (%1, %2) \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2192 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2193 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2194 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
7946 | 2195 "movq %%mm5, (%1, %%edx) \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2196 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2197 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2198 "jmp 4f \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2199 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2200 "1: \n\t" |
7946 | 2201 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2202 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2203 "movq (%0), %%mm0 \n\t" // L0 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2204 PAVGB((%1), %%mm0) // L0 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2205 "movq (%0, %2), %%mm1 \n\t" // L1 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2206 PAVGB((%1, %2), %%mm1) // L1 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2207 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2208 PAVGB((%1, %2, 2), %%mm2) // L2 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2209 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2210 PAVGB((%1, %%eax), %%mm3) // L3 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2211 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2212 PAVGB((%1, %2, 4), %%mm4) // L4 |
7946 | 2213 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
2214 PAVGB((%1, %%edx), %%mm5) // L5 | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2215 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2216 PAVGB((%1, %%eax, 2), %%mm6) // L6 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2217 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
5980
3b078401d610
3dnow temporal denoiser bugfix by R«±mi Guyomarch <rguyom@pobox.com>
michael
parents:
5787
diff
changeset
|
2218 PAVGB((%1, %%ecx), %%mm7) // L7 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2219 "movq %%mm0, (%1) \n\t" // R0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2220 "movq %%mm1, (%1, %2) \n\t" // R1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2221 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2222 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2223 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
7946 | 2224 "movq %%mm5, (%1, %%edx) \n\t" // R5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2225 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2226 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2227 "movq %%mm0, (%0) \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2228 "movq %%mm1, (%0, %2) \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2229 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2230 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2231 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
7946 | 2232 "movq %%mm5, (%0, %%edx) \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2233 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2234 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2235 "jmp 4f \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2236 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2237 "2: \n\t" |
7946 | 2238 "cmpl 508(%%edx), %%ecx \n\t" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2239 " jb 3f \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2240 |
7946 | 2241 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2242 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2243 "movq (%0), %%mm0 \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2244 "movq (%0, %2), %%mm1 \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2245 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2246 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2247 "movq (%1), %%mm4 \n\t" // R0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2248 "movq (%1, %2), %%mm5 \n\t" // R1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2249 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2250 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2251 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2252 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2253 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2254 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2255 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2256 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2257 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2258 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2259 "movq %%mm0, (%1) \n\t" // R0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2260 "movq %%mm1, (%1, %2) \n\t" // R1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2261 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2262 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2263 "movq %%mm0, (%0) \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2264 "movq %%mm1, (%0, %2) \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2265 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2266 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2267 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2268 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
7946 | 2269 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2270 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2271 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2272 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
7946 | 2273 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2274 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2275 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2276 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2277 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2278 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2279 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2280 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2281 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2282 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2283 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2284 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
7946 | 2285 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2286 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2287 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2288 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
7946 | 2289 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2290 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2291 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2292 "jmp 4f \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2293 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2294 "3: \n\t" |
7946 | 2295 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
2296 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2297 "movq (%0), %%mm0 \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2298 "movq (%0, %2), %%mm1 \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2299 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2300 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2301 "movq (%1), %%mm4 \n\t" // R0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2302 "movq (%1, %2), %%mm5 \n\t" // R1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2303 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2304 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2305 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2306 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2307 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2308 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2309 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2310 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2311 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2312 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2313 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2314 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2315 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2316 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2317 "movq %%mm0, (%1) \n\t" // R0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2318 "movq %%mm1, (%1, %2) \n\t" // R1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2319 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2320 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2321 "movq %%mm0, (%0) \n\t" // L0 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2322 "movq %%mm1, (%0, %2) \n\t" // L1 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2323 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2324 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2325 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2326 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
7946 | 2327 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2328 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2329 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2330 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
7946 | 2331 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2332 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2333 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2334 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2335 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2336 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2337 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2338 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2339 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2340 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2341 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2342 PAVGB(%%mm4, %%mm0) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2343 PAVGB(%%mm5, %%mm1) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2344 PAVGB(%%mm6, %%mm2) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2345 PAVGB(%%mm7, %%mm3) |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2346 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
7946 | 2347 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2348 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2349 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2350 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
7946 | 2351 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2352 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2353 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2354 |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2355 "4: \n\t" |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2356 |
2899 | 2357 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
7946 | 2358 : "%eax", "%edx", "%ecx", "memory" |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2359 ); |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2360 //printf("%d\n", test); |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2361 #else |
2860 | 2362 int y; |
2363 int d=0; | |
2364 int sysd=0; | |
2899 | 2365 int i; |
2860 | 2366 |
2367 for(y=0; y<8; y++) | |
2368 { | |
2369 int x; | |
2370 for(x=0; x<8; x++) | |
2371 { | |
2372 int ref= tempBlured[ x + y*stride ]; | |
2373 int cur= src[ x + y*stride ]; | |
2374 int d1=ref - cur; | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2375 // if(x==0 || x==7) d1+= d1>>1; |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2376 // if(y==0 || y==7) d1+= d1>>1; |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2377 // d+= ABS(d1); |
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2378 d+= d1*d1; |
2860 | 2379 sysd+= d1; |
2380 } | |
2381 } | |
2899 | 2382 i=d; |
2383 d= ( | |
2384 4*d | |
2385 +(*(tempBluredPast-256)) | |
2386 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
2387 +(*(tempBluredPast+256)) | |
2388 +4)>>3; | |
2389 *tempBluredPast=i; | |
2390 // ((*tempBluredPast)*3 + d + 2)>>2; | |
2391 | |
2860 | 2392 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
2393 /* | |
2394 Switch between | |
2395 1 0 0 0 0 0 0 (0) | |
2396 64 32 16 8 4 2 1 (1) | |
2397 64 48 36 27 20 15 11 (33) (approx) | |
2398 64 56 49 43 37 33 29 (200) (approx) | |
2399 */ | |
2400 if(d > maxNoise[1]) | |
2401 { | |
2402 if(d < maxNoise[2]) | |
2403 { | |
2404 for(y=0; y<8; y++) | |
2405 { | |
2406 int x; | |
2407 for(x=0; x<8; x++) | |
2408 { | |
2409 int ref= tempBlured[ x + y*stride ]; | |
2410 int cur= src[ x + y*stride ]; | |
2411 tempBlured[ x + y*stride ]= | |
2412 src[ x + y*stride ]= | |
2413 (ref + cur + 1)>>1; | |
2414 } | |
2415 } | |
2416 } | |
2417 else | |
2418 { | |
2419 for(y=0; y<8; y++) | |
2420 { | |
2421 int x; | |
2422 for(x=0; x<8; x++) | |
2423 { | |
2424 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
2425 } | |
2426 } | |
2427 } | |
2428 } | |
2429 else | |
2430 { | |
2431 if(d < maxNoise[0]) | |
2432 { | |
2433 for(y=0; y<8; y++) | |
2434 { | |
2435 int x; | |
2436 for(x=0; x<8; x++) | |
2437 { | |
2438 int ref= tempBlured[ x + y*stride ]; | |
2439 int cur= src[ x + y*stride ]; | |
2440 tempBlured[ x + y*stride ]= | |
2441 src[ x + y*stride ]= | |
2442 (ref*7 + cur + 4)>>3; | |
2443 } | |
2444 } | |
2445 } | |
2446 else | |
2447 { | |
2448 for(y=0; y<8; y++) | |
2449 { | |
2450 int x; | |
2451 for(x=0; x<8; x++) | |
2452 { | |
2453 int ref= tempBlured[ x + y*stride ]; | |
2454 int cur= src[ x + y*stride ]; | |
2455 tempBlured[ x + y*stride ]= | |
2456 src[ x + y*stride ]= | |
2457 (ref*3 + cur + 2)>>2; | |
2458 } | |
2459 } | |
2460 } | |
2461 } | |
2895
dd3fabd01df0
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
2860
diff
changeset
|
2462 #endif |
2860 | 2463 } |
2464 | |
3099 | 2465 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
7946 | 2466 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
2159 | 2467 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2468 /** |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2469 * Copies a block from src to dst and fixes the blacklevel |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2470 * levelFix == 0 -> don't touch the brightness & contrast |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2471 */ |
7220
e3ecccc7e505
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
6949
diff
changeset
|
2472 #undef SCALED_CPY |
e3ecccc7e505
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
6949
diff
changeset
|
2473 |
3099 | 2474 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
7946 | 2475 int levelFix, int64_t *packedOffsetAndScale) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2476 { |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
2477 #ifndef HAVE_MMX |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2478 int i; |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
2479 #endif |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2480 if(levelFix) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2481 { |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2482 #ifdef HAVE_MMX |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2483 asm volatile( |
7946 | 2484 "movq (%%eax), %%mm2 \n\t" // packedYOffset |
2485 "movq 8(%%eax), %%mm3 \n\t" // packedYScale | |
2486 "leal (%2,%4), %%eax \n\t" | |
2487 "leal (%3,%5), %%edx \n\t" | |
2181
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2488 "pxor %%mm4, %%mm4 \n\t" |
3171 | 2489 #ifdef HAVE_MMX2 |
2490 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
2491 "movq " #src1 ", %%mm0 \n\t"\ | |
2492 "movq " #src1 ", %%mm5 \n\t"\ | |
2493 "movq " #src2 ", %%mm1 \n\t"\ | |
2494 "movq " #src2 ", %%mm6 \n\t"\ | |
2495 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
2496 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
2497 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
2498 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
2499 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
2500 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
2501 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
2502 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
2503 "psubw %%mm2, %%mm0 \n\t"\ | |
2504 "psubw %%mm2, %%mm5 \n\t"\ | |
2505 "psubw %%mm2, %%mm1 \n\t"\ | |
2506 "psubw %%mm2, %%mm6 \n\t"\ | |
2507 "packuswb %%mm5, %%mm0 \n\t"\ | |
2508 "packuswb %%mm6, %%mm1 \n\t"\ | |
2509 "movq %%mm0, " #dst1 " \n\t"\ | |
2510 "movq %%mm1, " #dst2 " \n\t"\ | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2511 |
3171 | 2512 #else //HAVE_MMX2 |
3037 | 2513 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
2514 "movq " #src1 ", %%mm0 \n\t"\ | |
2515 "movq " #src1 ", %%mm5 \n\t"\ | |
2181
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2516 "punpcklbw %%mm4, %%mm0 \n\t"\ |
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2517 "punpckhbw %%mm4, %%mm5 \n\t"\ |
2394 | 2518 "psubw %%mm2, %%mm0 \n\t"\ |
2519 "psubw %%mm2, %%mm5 \n\t"\ | |
3037 | 2520 "movq " #src2 ", %%mm1 \n\t"\ |
2394 | 2521 "psllw $6, %%mm0 \n\t"\ |
2522 "psllw $6, %%mm5 \n\t"\ | |
2181
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2523 "pmulhw %%mm3, %%mm0 \n\t"\ |
3037 | 2524 "movq " #src2 ", %%mm6 \n\t"\ |
2181
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2525 "pmulhw %%mm3, %%mm5 \n\t"\ |
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2526 "punpcklbw %%mm4, %%mm1 \n\t"\ |
2401
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2527 "punpckhbw %%mm4, %%mm6 \n\t"\ |
2394 | 2528 "psubw %%mm2, %%mm1 \n\t"\ |
2401
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2529 "psubw %%mm2, %%mm6 \n\t"\ |
2394 | 2530 "psllw $6, %%mm1 \n\t"\ |
2401
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2531 "psllw $6, %%mm6 \n\t"\ |
2181
d90f8fc7ead6
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
2180
diff
changeset
|
2532 "pmulhw %%mm3, %%mm1 \n\t"\ |
2401
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2533 "pmulhw %%mm3, %%mm6 \n\t"\ |
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2534 "packuswb %%mm5, %%mm0 \n\t"\ |
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2535 "packuswb %%mm6, %%mm1 \n\t"\ |
3037 | 2536 "movq %%mm0, " #dst1 " \n\t"\ |
2537 "movq %%mm1, " #dst2 " \n\t"\ | |
2538 | |
3171 | 2539 #endif //!HAVE_MMX2 |
2540 | |
7946 | 2541 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
2542 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) | |
2543 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) | |
2544 "leal (%%eax,%4,4), %%eax \n\t" | |
2545 "leal (%%edx,%5,4), %%edx \n\t" | |
2546 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) | |
3037 | 2547 |
2548 | |
7946 | 2549 : "=&a" (packedOffsetAndScale) |
2550 : "0" (packedOffsetAndScale), | |
2551 "r"(src), | |
3037 | 2552 "r"(dst), |
2553 "r" (srcStride), | |
2401
bc69d7c0e1dc
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
2394
diff
changeset
|
2554 "r" (dstStride) |
7946 | 2555 : "%edx" |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2556 ); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2557 #else |
3031 | 2558 for(i=0; i<8; i++) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2559 memcpy( &(dst[dstStride*i]), |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2560 &(src[srcStride*i]), BLOCK_SIZE); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2561 #endif |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2562 } |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2563 else |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2564 { |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2565 #ifdef HAVE_MMX |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2566 asm volatile( |
3037 | 2567 "leal (%0,%2), %%eax \n\t" |
7946 | 2568 "leal (%1,%3), %%edx \n\t" |
3037 | 2569 |
2570 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
2571 "movq " #src1 ", %%mm0 \n\t"\ | |
2572 "movq " #src2 ", %%mm1 \n\t"\ | |
2573 "movq %%mm0, " #dst1 " \n\t"\ | |
2574 "movq %%mm1, " #dst2 " \n\t"\ | |
2575 | |
2576 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
7946 | 2577 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
2578 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) | |
3037 | 2579 "leal (%%eax,%2,4), %%eax \n\t" |
7946 | 2580 "leal (%%edx,%3,4), %%edx \n\t" |
2581 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) | |
3037 | 2582 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2583 : : "r" (src), |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2584 "r" (dst), |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2585 "r" (srcStride), |
3031 | 2586 "r" (dstStride) |
7946 | 2587 : "%eax", "%edx" |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2588 ); |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2589 #else |
3031 | 2590 for(i=0; i<8; i++) |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2591 memcpy( &(dst[dstStride*i]), |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2592 &(src[srcStride*i]), BLOCK_SIZE); |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2593 #endif |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2594 } |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2595 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2596 |
4403 | 2597 /** |
2598 * Duplicates the given 8 src pixels 3 times upward | |
2599 */ | |
2600 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
2601 { | |
2602 #ifdef HAVE_MMX | |
2603 asm volatile( | |
2604 "movq (%0), %%mm0 \n\t" | |
2605 "addl %1, %0 \n\t" | |
2606 "movq %%mm0, (%0) \n\t" | |
2607 "movq %%mm0, (%0, %1) \n\t" | |
2608 "movq %%mm0, (%0, %1, 2) \n\t" | |
2609 : "+r" (src) | |
2610 : "r" (-stride) | |
2611 ); | |
2612 #else | |
2613 int i; | |
2614 uint8_t *p=src; | |
2615 for(i=0; i<3; i++) | |
2616 { | |
2617 p-= stride; | |
2618 memcpy(p, src, 8); | |
2619 } | |
2620 #endif | |
2621 } | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2622 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2623 /** |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2624 * Filters array of bytes (Y or U or V values) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2625 */ |
3099 | 2626 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
7946 | 2627 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2628 { |
7946 | 2629 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2630 int x,y; |
3154
b2e24fec97bc
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
3099
diff
changeset
|
2631 #ifdef COMPILE_TIME_MODE |
b2e24fec97bc
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
3099
diff
changeset
|
2632 const int mode= COMPILE_TIME_MODE; |
b2e24fec97bc
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
3099
diff
changeset
|
2633 #else |
7946 | 2634 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
3154
b2e24fec97bc
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
3099
diff
changeset
|
2635 #endif |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2636 int black=0, white=255; // blackest black and whitest white in the picture |
4399 | 2637 int QPCorrecture= 256*256; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2638 |
3031 | 2639 int copyAhead; |
2640 | |
7946 | 2641 //FIXME remove |
2642 uint64_t * const yHistogram= c.yHistogram; | |
2643 uint8_t * const tempSrc= c.tempSrc; | |
2644 uint8_t * const tempDst= c.tempDst; | |
2285 | 2645 |
7946 | 2646 c.dcOffset= c.ppMode.maxDcDiff; |
2647 c.dcThreshold= c.ppMode.maxDcDiff*2 + 1; | |
3832
d05cfaf5f0f2
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
3817
diff
changeset
|
2648 |
2899 | 2649 #ifdef HAVE_MMX |
7946 | 2650 c.mmxDcOffset= 0x7F - c.dcOffset; |
2651 c.mmxDcThreshold= 0x7F - c.dcThreshold; | |
3832
d05cfaf5f0f2
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
3817
diff
changeset
|
2652 |
7946 | 2653 c.mmxDcOffset*= 0x0101010101010101LL; |
2654 c.mmxDcThreshold*= 0x0101010101010101LL; | |
2899 | 2655 #endif |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2656 |
3031 | 2657 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
7946 | 2658 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
2659 || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14; | |
3031 | 2660 else if( (mode & V_DEBLOCK) |
2661 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
2662 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
2663 else if(mode & V_X1_FILTER) copyAhead=11; | |
7946 | 2664 // else if(mode & V_RK1_FILTER) copyAhead=10; |
3031 | 2665 else if(mode & DERING) copyAhead=9; |
2666 else copyAhead=8; | |
2667 | |
2668 copyAhead-= 8; | |
2669 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2670 if(!isColor) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2671 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2672 uint64_t sum= 0; |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2673 int i; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2674 uint64_t maxClipped; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2675 uint64_t clipped; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2676 double scale; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2677 |
7946 | 2678 c.frameNum++; |
2679 // first frame is fscked so we ignore it | |
2680 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2681 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2682 for(i=0; i<256; i++) |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2683 { |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2684 sum+= yHistogram[i]; |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2685 // printf("%d ", yHistogram[i]); |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2686 } |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2687 // printf("\n\n"); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2688 |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2689 /* we always get a completely black picture first */ |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2690 maxClipped= (uint64_t)(sum * maxClippedThreshold); |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2691 |
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2692 clipped= sum; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2693 for(black=255; black>0; black--) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2694 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2695 if(clipped < maxClipped) break; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2696 clipped-= yHistogram[black]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2697 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2698 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2699 clipped= sum; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2700 for(white=0; white<256; white++) |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2701 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2702 if(clipped < maxClipped) break; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2703 clipped-= yHistogram[white]; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2704 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2705 |
7946 | 2706 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
3171 | 2707 |
2708 #ifdef HAVE_MMX2 | |
7946 | 2709 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
2710 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
3171 | 2711 #else |
7946 | 2712 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
2713 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
3171 | 2714 #endif |
2715 | |
7946 | 2716 c.packedYOffset|= c.packedYOffset<<32; |
2717 c.packedYOffset|= c.packedYOffset<<16; | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2718 |
7946 | 2719 c.packedYScale|= c.packedYScale<<32; |
2720 c.packedYScale|= c.packedYScale<<16; | |
4399 | 2721 |
2722 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
2723 else QPCorrecture= 256*256; | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2724 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2725 else |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2726 { |
7946 | 2727 c.packedYScale= 0x0100010001000100LL; |
2728 c.packedYOffset= 0; | |
4399 | 2729 QPCorrecture= 256*256; |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2730 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2731 |
2742
d5636499cafd
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
2595
diff
changeset
|
2732 /* copy & deinterlace first row of blocks */ |
2595 | 2733 y=-BLOCK_SIZE; |
2734 { | |
2735 uint8_t *srcBlock= &(src[y*srcStride]); | |
4403 | 2736 uint8_t *dstBlock= tempDst + dstStride; |
2595 | 2737 |
2738 // From this point on it is guaranteed that we can read and write 16 lines downward | |
2739 // finish 1 block before the next otherwise we might have a problem | |
2740 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
2741 for(x=0; x<width; x+=BLOCK_SIZE) | |
2742 { | |
2743 | |
2744 #ifdef HAVE_MMX2 | |
2745 /* | |
2746 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
2747 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
2748 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
2749 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
2750 */ | |
2751 | |
2752 asm( | |
2753 "movl %4, %%eax \n\t" | |
2754 "shrl $2, %%eax \n\t" | |
2755 "andl $6, %%eax \n\t" | |
3031 | 2756 "addl %5, %%eax \n\t" |
7946 | 2757 "movl %%eax, %%edx \n\t" |
2595 | 2758 "imul %1, %%eax \n\t" |
7946 | 2759 "imul %3, %%edx \n\t" |
2595 | 2760 "prefetchnta 32(%%eax, %0) \n\t" |
7946 | 2761 "prefetcht0 32(%%edx, %2) \n\t" |
2595 | 2762 "addl %1, %%eax \n\t" |
7946 | 2763 "addl %3, %%edx \n\t" |
2595 | 2764 "prefetchnta 32(%%eax, %0) \n\t" |
7946 | 2765 "prefetcht0 32(%%edx, %2) \n\t" |
2595 | 2766 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
3031 | 2767 "m" (x), "m" (copyAhead) |
7946 | 2768 : "%eax", "%edx" |
2595 | 2769 ); |
2770 | |
2771 #elif defined(HAVE_3DNOW) | |
2772 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
2773 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
2774 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
2775 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
2776 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
2777 */ | |
2778 #endif | |
2779 | |
4403 | 2780 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
7946 | 2781 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
4403 | 2782 |
2783 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
2595 | 2784 |
2785 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
3099 | 2786 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
2595 | 2787 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3099 | 2788 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
2595 | 2789 else if(mode & MEDIAN_DEINT_FILTER) |
3099 | 2790 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
2595 | 2791 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3099 | 2792 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
7946 | 2793 else if(mode & FFMPEG_DEINT_FILTER) |
2794 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
2595 | 2795 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
3099 | 2796 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
2595 | 2797 */ |
2798 dstBlock+=8; | |
2799 srcBlock+=8; | |
2800 } | |
4403 | 2801 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride ); |
2595 | 2802 } |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2803 |
7946 | 2804 //printf("\n"); |
2246 | 2805 for(y=0; y<height; y+=BLOCK_SIZE) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2806 { |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2807 //1% speedup if these are here instead of the inner loop |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2808 uint8_t *srcBlock= &(src[y*srcStride]); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2809 uint8_t *dstBlock= &(dst[y*dstStride]); |
3099 | 2810 #ifdef HAVE_MMX |
7946 | 2811 uint8_t *tempBlock1= c.tempBlocks; |
2812 uint8_t *tempBlock2= c.tempBlocks + 8; | |
3099 | 2813 #endif |
2437 | 2814 #ifdef ARCH_X86 |
2815 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | |
4399 | 2816 int QPDelta= isColor ? (-1) : 1<<31; |
2817 int QPFrac= 1<<30; | |
2437 | 2818 #endif |
2860 | 2819 int QP=0; |
2473 | 2820 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
2821 if not then use a temporary buffer */ | |
2246 | 2822 if(y+15 >= height) |
2823 { | |
2860 | 2824 int i; |
3031 | 2825 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
2246 | 2826 blockcopy to dst later */ |
3031 | 2827 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
2828 srcStride*MAX(height-y-copyAhead, 0) ); | |
2829 | |
2830 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | |
2831 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
2860 | 2832 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
2833 | |
3031 | 2834 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
2835 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
2836 | |
2837 /* duplicate last line of dst to fill the void up to line (copyAhead) */ | |
2838 for(i=height-y+1; i<=copyAhead; i++) | |
2860 | 2839 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
2840 | |
2473 | 2841 dstBlock= tempDst + dstStride; |
2246 | 2842 srcBlock= tempSrc; |
2843 } | |
7946 | 2844 //printf("\n"); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2845 |
2285 | 2846 // From this point on it is guaranteed that we can read and write 16 lines downward |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2847 // finish 1 block before the next otherwise we might have a problem |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2848 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2849 for(x=0; x<width; x+=BLOCK_SIZE) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2850 { |
2168
21a8f158d19f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
2159
diff
changeset
|
2851 const int stride= dstStride; |
3099 | 2852 #ifdef HAVE_MMX |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2853 uint8_t *tmpXchg; |
3099 | 2854 #endif |
2437 | 2855 #ifdef ARCH_X86 |
2860 | 2856 QP= *QPptr; |
2437 | 2857 asm volatile( |
2858 "addl %2, %1 \n\t" | |
2859 "sbbl %%eax, %%eax \n\t" | |
2860 "shll $2, %%eax \n\t" | |
2861 "subl %%eax, %0 \n\t" | |
2862 : "+r" (QPptr), "+m" (QPFrac) | |
2863 : "r" (QPDelta) | |
2864 : "%eax" | |
2865 ); | |
2866 #else | |
2860 | 2867 QP= isColor ? |
2437 | 2868 QPs[(y>>3)*QPStride + (x>>3)]: |
2869 QPs[(y>>4)*QPStride + (x>>4)]; | |
2870 #endif | |
2871 if(!isColor) | |
2428 | 2872 { |
4399 | 2873 QP= (QP* QPCorrecture + 256*128)>>16; |
2742
d5636499cafd
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
2595
diff
changeset
|
2874 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
2428 | 2875 } |
7946 | 2876 //printf("%d ", QP); |
2877 c.QP= QP; | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2878 #ifdef HAVE_MMX |
2246 | 2879 asm volatile( |
7946 | 2880 "movd %1, %%mm7 \n\t" |
2246 | 2881 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
2882 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
2883 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
7946 | 2884 "movq %%mm7, %0 \n\t" |
2885 : "=m" (c.pQPb) | |
2886 : "r" (QP) | |
2246 | 2887 ); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2888 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2889 |
2159 | 2890 |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2891 #ifdef HAVE_MMX2 |
2437 | 2892 /* |
2893 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
2894 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
2895 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
2896 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
2897 */ | |
2898 | |
2899 asm( | |
2900 "movl %4, %%eax \n\t" | |
2901 "shrl $2, %%eax \n\t" | |
2902 "andl $6, %%eax \n\t" | |
3031 | 2903 "addl %5, %%eax \n\t" |
7946 | 2904 "movl %%eax, %%edx \n\t" |
2437 | 2905 "imul %1, %%eax \n\t" |
7946 | 2906 "imul %3, %%edx \n\t" |
2437 | 2907 "prefetchnta 32(%%eax, %0) \n\t" |
7946 | 2908 "prefetcht0 32(%%edx, %2) \n\t" |
2437 | 2909 "addl %1, %%eax \n\t" |
7946 | 2910 "addl %3, %%edx \n\t" |
2437 | 2911 "prefetchnta 32(%%eax, %0) \n\t" |
7946 | 2912 "prefetcht0 32(%%edx, %2) \n\t" |
2437 | 2913 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
3031 | 2914 "m" (x), "m" (copyAhead) |
7946 | 2915 : "%eax", "%edx" |
2437 | 2916 ); |
2917 | |
2159 | 2918 #elif defined(HAVE_3DNOW) |
2919 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
2246 | 2920 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2921 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
2922 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
2923 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
2159 | 2924 */ |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2925 #endif |
2246 | 2926 |
3099 | 2927 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
7946 | 2928 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2929 |
2246 | 2930 if(mode & LINEAR_IPOL_DEINT_FILTER) |
3099 | 2931 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
2246 | 2932 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3099 | 2933 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
2246 | 2934 else if(mode & MEDIAN_DEINT_FILTER) |
3099 | 2935 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
2246 | 2936 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3099 | 2937 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
7946 | 2938 else if(mode & FFMPEG_DEINT_FILTER) |
2939 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
2246 | 2940 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
3099 | 2941 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
2203
f90b6e259dc8
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
2195
diff
changeset
|
2942 */ |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2943 |
2246 | 2944 /* only deblock if we have 2 blocks */ |
2945 if(y + 8 < height) | |
2946 { | |
7946 | 2947 if(mode & V_X1_FILTER) |
2948 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
2300
e10f7dc4938f
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2949 else if(mode & V_DEBLOCK) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2950 { |
7946 | 2951 if( RENAME(isVertDC)(dstBlock, stride, &c)) |
2159 | 2952 { |
7946 | 2953 if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c)) |
2954 RENAME(doVertLowPass)(dstBlock, stride, &c); | |
2159 | 2955 } |
2300
e10f7dc4938f
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2956 else |
7946 | 2957 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2958 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2959 } |
2473 | 2960 |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2961 #ifdef HAVE_MMX |
3099 | 2962 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2963 #endif |
2246 | 2964 /* check if we have a previous block to deblock it with dstBlock */ |
2285 | 2965 if(x - 8 >= 0) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2966 { |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2967 #ifdef HAVE_MMX |
7946 | 2968 if(mode & H_X1_FILTER) |
2969 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2970 else if(mode & H_DEBLOCK) |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2971 { |
7946 | 2972 if( RENAME(isVertDC)(tempBlock1, 16, &c)) |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2973 { |
7946 | 2974 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c)) |
2975 RENAME(doVertLowPass)(tempBlock1, 16, &c); | |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2976 } |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2977 else |
7946 | 2978 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2979 } |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2980 |
3099 | 2981 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2982 |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2983 #else |
2300
e10f7dc4938f
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2984 if(mode & H_X1_FILTER) |
e10f7dc4938f
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2985 horizX1Filter(dstBlock-4, stride, QP); |
e10f7dc4938f
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2986 else if(mode & H_DEBLOCK) |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2987 { |
7946 | 2988 if( isHorizDC(dstBlock-4, stride, &c)) |
2159 | 2989 { |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2990 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2991 doHorizLowPass(dstBlock-4, stride, QP); |
2159 | 2992 } |
2300
e10f7dc4938f
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
2286
diff
changeset
|
2993 else |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2994 doHorizDefFilter(dstBlock-4, stride, QP); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2995 } |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
2996 #endif |
2473 | 2997 if(mode & DERING) |
2998 { | |
2999 //FIXME filter first line | |
7946 | 3000 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
2473 | 3001 } |
2860 | 3002 |
3003 if(mode & TEMP_NOISE_FILTER) | |
3004 { | |
3099 | 3005 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
7946 | 3006 c.tempBlured[isColor] + y*dstStride + x, |
3007 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3008 c.ppMode.maxTmpNoise); | |
2860 | 3009 } |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3010 } |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3011 |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3012 dstBlock+=8; |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3013 srcBlock+=8; |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
3014 |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
3015 #ifdef HAVE_MMX |
2454
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
3016 tmpXchg= tempBlock1; |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
3017 tempBlock1= tempBlock2; |
b74c2a08eac9
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
2437
diff
changeset
|
3018 tempBlock2 = tmpXchg; |
2461
60f16575bece
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
2454
diff
changeset
|
3019 #endif |
2246 | 3020 } |
3021 | |
2860 | 3022 if(mode & DERING) |
3023 { | |
7946 | 3024 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
2860 | 3025 } |
3026 | |
3027 if((mode & TEMP_NOISE_FILTER)) | |
3028 { | |
3099 | 3029 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
7946 | 3030 c.tempBlured[isColor] + y*dstStride + x, |
3031 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3032 c.ppMode.maxTmpNoise); | |
2860 | 3033 } |
3034 | |
2595 | 3035 /* did we use a tmp buffer for the last lines*/ |
2285 | 3036 if(y+15 >= height) |
2246 | 3037 { |
3038 uint8_t *dstBlock= &(dst[y*dstStride]); | |
2473 | 3039 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3040 } |
3013 | 3041 /* |
3042 for(x=0; x<width; x+=32) | |
3043 { | |
3031 | 3044 volatile int i; |
3013 | 3045 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
3046 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
3031 | 3047 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
3048 // + dstBlock[x +13*dstStride] | |
3049 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
3050 }*/ | |
3051 } | |
2159 | 3052 #ifdef HAVE_3DNOW |
3053 asm volatile("femms"); | |
3054 #elif defined (HAVE_MMX) | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3055 asm volatile("emms"); |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3056 #endif |
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3057 |
3013 | 3058 #ifdef DEBUG_BRIGHTNESS |
3059 if(!isColor) | |
3060 { | |
3061 int max=1; | |
3062 int i; | |
3063 for(i=0; i<256; i++) | |
3064 if(yHistogram[i] > max) max=yHistogram[i]; | |
3065 | |
3066 for(i=1; i<256; i++) | |
3067 { | |
3068 int x; | |
3069 int start=yHistogram[i-1]/(max/256+1); | |
3070 int end=yHistogram[i]/(max/256+1); | |
3071 int inc= end > start ? 1 : -1; | |
3072 for(x=start; x!=end+inc; x+=inc) | |
3073 dst[ i*dstStride + x]+=128; | |
3074 } | |
3075 | |
3076 for(i=0; i<100; i+=2) | |
3077 { | |
3078 dst[ (white)*dstStride + i]+=128; | |
3079 dst[ (black)*dstStride + i]+=128; | |
3080 } | |
3081 | |
3082 } | |
3083 #endif | |
3084 | |
7946 | 3085 *c2= c; //copy local context back |
3086 | |
2158
508468a75be0
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3087 } |