Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 3198:6b9f0c4fbdbe libavcodec
First part of a series of speed-enhancing patches.
This one sets up a snow.h and makes snow use the dsputil function pointer
framework to access the three functions that will be implemented in asm
in the other parts of the patchset.
Patch by Robert Edele < yartrebo AH earthlink POIS net>
Original thread:
Subject: [Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Date: Sun, 05 Feb 2006 12:47:14 -0500
author | gpoirier |
---|---|
date | Thu, 16 Mar 2006 19:18:18 +0000 |
parents | 0b546eab515d |
children |
rev | line source |
---|---|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
1109 | 19 /** |
20 * @file postprocess_template.c | |
21 * mmx/mmx2/3dnow postprocess code. | |
22 */ | |
23 | |
24 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
25 #ifdef ARCH_X86_64 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
26 # define REGa rax |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
27 # define REGc rcx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
28 # define REGd rdx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
29 # define REG_a "rax" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
30 # define REG_c "rcx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
31 # define REG_d "rdx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
32 # define REG_SP "rsp" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
33 # define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
34 #else |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
35 # define REGa eax |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
36 # define REGc ecx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
37 # define REGd edx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
38 # define REG_a "eax" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
39 # define REG_c "ecx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
40 # define REG_d "edx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
41 # define REG_SP "esp" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
42 # define ALIGN_MASK "$0xFFFFFFF8" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
43 #endif |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
44 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
45 |
169 | 46 #undef PAVGB |
47 #undef PMINUB | |
48 #undef PMAXUB | |
104 | 49 |
50 #ifdef HAVE_MMX2 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
51 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
104 | 52 #elif defined (HAVE_3DNOW) |
2295 | 53 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
104 | 54 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
55 #define PAVGB(a,b) REAL_PAVGB(a,b) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
56 |
134 | 57 #ifdef HAVE_MMX2 |
58 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
59 #elif defined (HAVE_MMX) | |
60 #define PMINUB(b,a,t) \ | |
2979 | 61 "movq " #a ", " #t " \n\t"\ |
62 "psubusb " #b ", " #t " \n\t"\ | |
63 "psubb " #t ", " #a " \n\t" | |
134 | 64 #endif |
65 | |
66 #ifdef HAVE_MMX2 | |
67 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
68 #elif defined (HAVE_MMX) | |
69 #define PMAXUB(a,b) \ | |
2979 | 70 "psubusb " #a ", " #b " \n\t"\ |
71 "paddb " #a ", " #b " \n\t" | |
134 | 72 #endif |
73 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
74 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
787 | 75 #ifdef HAVE_MMX |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
76 /** |
111 | 77 * Check if the middle 8x8 Block in the given 8x16 block is flat |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 */ |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
79 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
2979 | 80 int numEq= 0, dcOk; |
81 src+= stride*4; // src points to begin of the 8x8 Block | |
119 | 82 asm volatile( |
2979 | 83 "movq %0, %%mm7 \n\t" |
84 "movq %1, %%mm6 \n\t" | |
1331 | 85 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) |
86 ); | |
2967 | 87 |
1331 | 88 asm volatile( |
2979 | 89 "lea (%2, %3), %%"REG_a" \n\t" |
90 // 0 1 2 3 4 5 6 7 8 9 | |
91 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 | |
92 | |
93 "movq (%2), %%mm0 \n\t" | |
94 "movq (%%"REG_a"), %%mm1 \n\t" | |
95 "movq %%mm0, %%mm3 \n\t" | |
96 "movq %%mm0, %%mm4 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
97 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
98 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 99 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
100 "paddb %%mm7, %%mm0 \n\t" | |
101 "pcmpgtb %%mm6, %%mm0 \n\t" | |
102 | |
103 "movq (%%"REG_a",%3), %%mm2 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
104 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
105 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 106 "psubb %%mm2, %%mm1 \n\t" |
107 "paddb %%mm7, %%mm1 \n\t" | |
108 "pcmpgtb %%mm6, %%mm1 \n\t" | |
109 "paddb %%mm1, %%mm0 \n\t" | |
110 | |
111 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
112 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
113 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 114 "psubb %%mm1, %%mm2 \n\t" |
115 "paddb %%mm7, %%mm2 \n\t" | |
116 "pcmpgtb %%mm6, %%mm2 \n\t" | |
117 "paddb %%mm2, %%mm0 \n\t" | |
118 | |
119 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" | |
120 | |
121 "movq (%2, %3, 4), %%mm2 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
122 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
123 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 124 "psubb %%mm2, %%mm1 \n\t" |
125 "paddb %%mm7, %%mm1 \n\t" | |
126 "pcmpgtb %%mm6, %%mm1 \n\t" | |
127 "paddb %%mm1, %%mm0 \n\t" | |
128 | |
129 "movq (%%"REG_a"), %%mm1 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
130 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
131 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 132 "psubb %%mm1, %%mm2 \n\t" |
133 "paddb %%mm7, %%mm2 \n\t" | |
134 "pcmpgtb %%mm6, %%mm2 \n\t" | |
135 "paddb %%mm2, %%mm0 \n\t" | |
136 | |
137 "movq (%%"REG_a", %3), %%mm2 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
138 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
139 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 140 "psubb %%mm2, %%mm1 \n\t" |
141 "paddb %%mm7, %%mm1 \n\t" | |
142 "pcmpgtb %%mm6, %%mm1 \n\t" | |
143 "paddb %%mm1, %%mm0 \n\t" | |
144 | |
145 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
146 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
147 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 148 "psubb %%mm1, %%mm2 \n\t" |
149 "paddb %%mm7, %%mm2 \n\t" | |
150 "pcmpgtb %%mm6, %%mm2 \n\t" | |
151 "paddb %%mm2, %%mm0 \n\t" | |
152 "psubusb %%mm3, %%mm4 \n\t" | |
153 | |
154 " \n\t" | |
167 | 155 #ifdef HAVE_MMX2 |
2979 | 156 "pxor %%mm7, %%mm7 \n\t" |
157 "psadbw %%mm7, %%mm0 \n\t" | |
167 | 158 #else |
2979 | 159 "movq %%mm0, %%mm1 \n\t" |
160 "psrlw $8, %%mm0 \n\t" | |
161 "paddb %%mm1, %%mm0 \n\t" | |
162 "movq %%mm0, %%mm1 \n\t" | |
163 "psrlq $16, %%mm0 \n\t" | |
164 "paddb %%mm1, %%mm0 \n\t" | |
165 "movq %%mm0, %%mm1 \n\t" | |
166 "psrlq $32, %%mm0 \n\t" | |
167 "paddb %%mm1, %%mm0 \n\t" | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
168 #endif |
2979 | 169 "movq %4, %%mm7 \n\t" // QP,..., QP |
170 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
171 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 | |
172 "packssdw %%mm4, %%mm4 \n\t" | |
173 "movd %%mm0, %0 \n\t" | |
174 "movd %%mm4, %1 \n\t" | |
175 | |
176 : "=r" (numEq), "=r" (dcOk) | |
177 : "r" (src), "r" ((long)stride), "m" (c->pQPb) | |
178 : "%"REG_a | |
179 ); | |
180 | |
181 numEq= (-numEq) &0xFF; | |
182 if(numEq > c->ppMode.flatnessThreshold){ | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
183 if(dcOk) return 0; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
184 else return 1; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
185 }else{ |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
186 return 2; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
187 } |
787 | 188 } |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
189 #endif //HAVE_MMX |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
190 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 /** |
111 | 192 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
107 | 193 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 */ |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
195 #ifndef HAVE_ALTIVEC |
787 | 196 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
197 { |
96 | 198 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 199 src+= stride*3; |
200 asm volatile( //"movv %0 %1 %2\n\t" | |
201 "movq %2, %%mm0 \n\t" // QP,..., QP | |
202 "pxor %%mm4, %%mm4 \n\t" | |
203 | |
204 "movq (%0), %%mm6 \n\t" | |
205 "movq (%0, %1), %%mm5 \n\t" | |
206 "movq %%mm5, %%mm1 \n\t" | |
207 "movq %%mm6, %%mm2 \n\t" | |
208 "psubusb %%mm6, %%mm5 \n\t" | |
209 "psubusb %%mm1, %%mm2 \n\t" | |
210 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
211 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
212 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF | |
213 | |
214 "pand %%mm2, %%mm6 \n\t" | |
215 "pandn %%mm1, %%mm2 \n\t" | |
216 "por %%mm2, %%mm6 \n\t"// First Line to Filter | |
217 | |
218 "movq (%0, %1, 8), %%mm5 \n\t" | |
219 "lea (%0, %1, 4), %%"REG_a" \n\t" | |
220 "lea (%0, %1, 8), %%"REG_c" \n\t" | |
221 "sub %1, %%"REG_c" \n\t" | |
222 "add %1, %0 \n\t" // %0 points to line 1 not 0 | |
223 "movq (%0, %1, 8), %%mm7 \n\t" | |
224 "movq %%mm5, %%mm1 \n\t" | |
225 "movq %%mm7, %%mm2 \n\t" | |
226 "psubusb %%mm7, %%mm5 \n\t" | |
227 "psubusb %%mm1, %%mm2 \n\t" | |
228 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
229 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
230 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF | |
231 | |
232 "pand %%mm2, %%mm7 \n\t" | |
233 "pandn %%mm1, %%mm2 \n\t" | |
234 "por %%mm2, %%mm7 \n\t" // Last Line to Filter | |
235 | |
236 | |
237 // 1 2 3 4 5 6 7 8 | |
238 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 | |
239 // 6 4 2 2 1 1 | |
240 // 6 4 4 2 | |
241 // 6 8 2 | |
242 | |
243 "movq (%0, %1), %%mm0 \n\t" // 1 | |
244 "movq %%mm0, %%mm1 \n\t" // 1 | |
245 PAVGB(%%mm6, %%mm0) //1 1 /2 | |
246 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
247 | |
248 "movq (%0, %1, 4), %%mm2 \n\t" // 1 | |
249 "movq %%mm2, %%mm5 \n\t" // 1 | |
250 PAVGB((%%REGa), %%mm2) // 11 /2 | |
251 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
252 "movq %%mm2, %%mm3 \n\t" // 211 /4 | |
253 "movq (%0), %%mm4 \n\t" // 1 | |
254 PAVGB(%%mm4, %%mm3) // 4 211 /8 | |
255 PAVGB(%%mm0, %%mm3) //642211 /16 | |
256 "movq %%mm3, (%0) \n\t" // X | |
257 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | |
258 "movq %%mm1, %%mm0 \n\t" // 1 | |
259 PAVGB(%%mm6, %%mm0) //1 1 /2 | |
260 "movq %%mm4, %%mm3 \n\t" // 1 | |
261 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 | |
262 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 | |
263 PAVGB((%%REGa), %%mm5) // 211 /4 | |
264 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
265 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
266 "movq %%mm3, (%0,%1) \n\t" // X | |
267 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | |
268 PAVGB(%%mm4, %%mm6) //11 /2 | |
269 "movq (%%"REG_c"), %%mm0 \n\t" // 1 | |
270 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 | |
271 "movq %%mm0, %%mm3 \n\t" // 11/2 | |
272 PAVGB(%%mm1, %%mm0) // 2 11/4 | |
273 PAVGB(%%mm6, %%mm0) //222 11/8 | |
274 PAVGB(%%mm2, %%mm0) //22242211/16 | |
275 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | |
276 "movq %%mm0, (%0, %1, 2) \n\t" // X | |
277 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | |
278 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 | |
279 PAVGB((%%REGc), %%mm0) // 11 /2 | |
280 PAVGB(%%mm0, %%mm6) //11 11 /4 | |
281 PAVGB(%%mm1, %%mm4) // 11 /2 | |
282 PAVGB(%%mm2, %%mm1) // 11 /2 | |
283 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
284 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
285 "movq (%%"REG_a"), %%mm5 \n\t" // 1 | |
286 "movq %%mm6, (%%"REG_a") \n\t" // X | |
287 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | |
288 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 | |
289 PAVGB(%%mm7, %%mm6) // 11 /2 | |
290 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
291 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
292 PAVGB(%%mm5, %%mm2) // 11 /2 | |
293 "movq (%0, %1, 4), %%mm4 \n\t" // 1 | |
294 PAVGB(%%mm4, %%mm2) // 112 /4 | |
295 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
296 "movq %%mm6, (%0, %1, 4) \n\t" // X | |
297 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | |
298 PAVGB(%%mm7, %%mm1) // 11 2 /4 | |
299 PAVGB(%%mm4, %%mm5) // 11 /2 | |
300 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
301 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 | |
302 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | |
303 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
304 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X | |
305 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | |
306 PAVGB((%%REGc), %%mm2) // 112 4 /8 | |
307 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 | |
308 PAVGB(%%mm0, %%mm6) // 1 1 /2 | |
309 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
310 PAVGB(%%mm2, %%mm6) // 1122424 /16 | |
311 "movq %%mm6, (%%"REG_c") \n\t" // X | |
312 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | |
313 PAVGB(%%mm7, %%mm5) // 11 2 /4 | |
314 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
315 | |
316 PAVGB(%%mm3, %%mm0) // 112 /4 | |
317 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
318 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X | |
319 "sub %1, %0 \n\t" | |
320 | |
321 : | |
322 : "r" (src), "r" ((long)stride), "m" (c->pQPb) | |
323 : "%"REG_a, "%"REG_c | |
324 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
325 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 326 const int l1= stride; |
327 const int l2= stride + l1; | |
328 const int l3= stride + l2; | |
329 const int l4= stride + l3; | |
330 const int l5= stride + l4; | |
331 const int l6= stride + l5; | |
332 const int l7= stride + l6; | |
333 const int l8= stride + l7; | |
334 const int l9= stride + l8; | |
335 int x; | |
336 src+= stride*3; | |
337 for(x=0; x<BLOCK_SIZE; x++) | |
338 { | |
339 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; | |
340 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
341 | |
342 int sums[10]; | |
343 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; | |
344 sums[1] = sums[0] - first + src[l4]; | |
345 sums[2] = sums[1] - first + src[l5]; | |
346 sums[3] = sums[2] - first + src[l6]; | |
347 sums[4] = sums[3] - first + src[l7]; | |
348 sums[5] = sums[4] - src[l1] + src[l8]; | |
349 sums[6] = sums[5] - src[l2] + last; | |
350 sums[7] = sums[6] - src[l3] + last; | |
351 sums[8] = sums[7] - src[l4] + last; | |
352 sums[9] = sums[8] - src[l5] + last; | |
353 | |
354 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; | |
355 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; | |
356 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; | |
357 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; | |
358 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; | |
359 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; | |
360 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | |
361 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | |
362 | |
363 src++; | |
364 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
365 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
366 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
367 #endif //HAVE_ALTIVEC |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
368 |
787 | 369 #if 0 |
96 | 370 /** |
371 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
372 * values are correctly clipped (MMX2) | |
373 * values are wraparound (C) | |
374 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient | |
2979 | 375 0 8 16 24 |
376 x = 8 | |
377 x/2 = 4 | |
378 x/8 = 1 | |
379 1 12 12 23 | |
96 | 380 */ |
169 | 381 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
96 | 382 { |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
383 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 384 src+= stride*3; |
96 | 385 // FIXME rounding |
2979 | 386 asm volatile( |
387 "pxor %%mm7, %%mm7 \n\t" // 0 | |
388 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE | |
389 "leal (%0, %1), %%"REG_a" \n\t" | |
390 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" | |
391 // 0 1 2 3 4 5 6 7 8 9 | |
392 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 | |
393 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP | |
394 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | |
395 "paddusb "MANGLE(b02)", %%mm0 \n\t" | |
396 "psrlw $2, %%mm0 \n\t" | |
397 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 | |
398 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | |
399 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
400 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 | |
401 "movq %%mm2, %%mm4 \n\t" // line 4 | |
402 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
403 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
404 PAVGB(%%mm3, %%mm5) | |
405 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | |
406 "psubusb %%mm3, %%mm4 \n\t" | |
407 "psubusb %%mm2, %%mm3 \n\t" | |
408 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
409 "psubusb %%mm0, %%mm4 \n\t" | |
410 "pcmpeqb %%mm7, %%mm4 \n\t" | |
411 "pand %%mm4, %%mm5 \n\t" // d/2 | |
412 | |
413 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
414 "paddb %%mm5, %%mm2 \n\t" | |
415 // "psubb %%mm6, %%mm2 \n\t" | |
416 "movq %%mm2, (%0,%1, 4) \n\t" | |
417 | |
418 "movq (%%"REG_c"), %%mm2 \n\t" | |
419 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | |
420 "psubb %%mm5, %%mm2 \n\t" | |
421 // "psubb %%mm6, %%mm2 \n\t" | |
422 "movq %%mm2, (%%"REG_c") \n\t" | |
423 | |
424 "paddb %%mm6, %%mm5 \n\t" | |
425 "psrlw $2, %%mm5 \n\t" | |
426 "pand "MANGLE(b3F)", %%mm5 \n\t" | |
427 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
428 | |
429 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" | |
430 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
431 "paddsb %%mm5, %%mm2 \n\t" | |
432 "psubb %%mm6, %%mm2 \n\t" | |
433 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" | |
434 | |
435 "movq (%%"REG_c", %1), %%mm2 \n\t" | |
436 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | |
437 "psubsb %%mm5, %%mm2 \n\t" | |
438 "psubb %%mm6, %%mm2 \n\t" | |
439 "movq %%mm2, (%%"REG_c", %1) \n\t" | |
440 | |
441 : | |
442 : "r" (src), "r" ((long)stride) | |
443 : "%"REG_a, "%"REG_c | |
444 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
445 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 446 const int l1= stride; |
447 const int l2= stride + l1; | |
448 const int l3= stride + l2; | |
449 const int l4= stride + l3; | |
450 const int l5= stride + l4; | |
451 const int l6= stride + l5; | |
452 // const int l7= stride + l6; | |
453 // const int l8= stride + l7; | |
454 // const int l9= stride + l8; | |
455 int x; | |
456 const int QP15= QP + (QP>>2); | |
457 src+= stride*3; | |
458 for(x=0; x<BLOCK_SIZE; x++) | |
459 { | |
460 const int v = (src[x+l5] - src[x+l4]); | |
461 if(ABS(v) < QP15) | |
462 { | |
463 src[x+l3] +=v>>3; | |
464 src[x+l4] +=v>>1; | |
465 src[x+l5] -=v>>1; | |
466 src[x+l6] -=v>>3; | |
467 | |
468 } | |
469 } | |
96 | 470 |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
471 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
96 | 472 } |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
473 #endif //0 |
96 | 474 |
475 /** | |
476 * Experimental Filter 1 | |
99 | 477 * will not damage linear gradients |
478 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
479 * can only smooth blocks at the expected locations (it can't smooth them if they did move) |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
480 * MMX2 version does correct clipping, C version doesn't |
96 | 481 */ |
787 | 482 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
96 | 483 { |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
484 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 485 src+= stride*3; |
486 | |
487 asm volatile( | |
488 "pxor %%mm7, %%mm7 \n\t" // 0 | |
489 "lea (%0, %1), %%"REG_a" \n\t" | |
490 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | |
491 // 0 1 2 3 4 5 6 7 8 9 | |
492 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 | |
493 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 | |
494 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 | |
495 "movq %%mm1, %%mm2 \n\t" // line 4 | |
496 "psubusb %%mm0, %%mm1 \n\t" | |
497 "psubusb %%mm2, %%mm0 \n\t" | |
498 "por %%mm1, %%mm0 \n\t" // |l3 - l4| | |
499 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 | |
500 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 | |
501 "movq %%mm3, %%mm5 \n\t" // line 5 | |
502 "psubusb %%mm4, %%mm3 \n\t" | |
503 "psubusb %%mm5, %%mm4 \n\t" | |
504 "por %%mm4, %%mm3 \n\t" // |l5 - l6| | |
505 PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2 | |
506 "movq %%mm2, %%mm1 \n\t" // line 4 | |
507 "psubusb %%mm5, %%mm2 \n\t" | |
508 "movq %%mm2, %%mm4 \n\t" | |
509 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | |
510 "psubusb %%mm1, %%mm5 \n\t" | |
511 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | |
512 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) | |
513 "movq %%mm4, %%mm3 \n\t" // d | |
514 "movq %2, %%mm0 \n\t" | |
515 "paddusb %%mm0, %%mm0 \n\t" | |
516 "psubusb %%mm0, %%mm4 \n\t" | |
517 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2*QP ? -1 : 0 | |
518 "psubusb "MANGLE(b01)", %%mm3 \n\t" | |
519 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | |
520 | |
521 PAVGB(%%mm7, %%mm3) // d/2 | |
522 "movq %%mm3, %%mm1 \n\t" // d/2 | |
523 PAVGB(%%mm7, %%mm3) // d/4 | |
524 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
525 | |
526 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | |
527 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | |
528 "psubusb %%mm3, %%mm0 \n\t" | |
529 "pxor %%mm2, %%mm0 \n\t" | |
530 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 | |
531 | |
532 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 | |
533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | |
534 "paddusb %%mm3, %%mm0 \n\t" | |
535 "pxor %%mm2, %%mm0 \n\t" | |
536 "movq %%mm0, (%%"REG_c") \n\t" // line 5 | |
537 | |
538 PAVGB(%%mm7, %%mm1) // d/4 | |
539 | |
540 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 | |
541 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3 | |
542 "psubusb %%mm1, %%mm0 \n\t" | |
543 "pxor %%mm2, %%mm0 \n\t" | |
544 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 | |
545 | |
546 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 | |
547 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6 | |
548 "paddusb %%mm1, %%mm0 \n\t" | |
549 "pxor %%mm2, %%mm0 \n\t" | |
550 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 | |
551 | |
552 PAVGB(%%mm7, %%mm1) // d/8 | |
553 | |
554 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 | |
555 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | |
556 "psubusb %%mm1, %%mm0 \n\t" | |
557 "pxor %%mm2, %%mm0 \n\t" | |
558 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 | |
559 | |
560 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 | |
561 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | |
562 "paddusb %%mm1, %%mm0 \n\t" | |
563 "pxor %%mm2, %%mm0 \n\t" | |
564 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 | |
565 | |
566 : | |
567 : "r" (src), "r" ((long)stride), "m" (co->pQPb) | |
568 : "%"REG_a, "%"REG_c | |
569 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
570 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
571 |
2979 | 572 const int l1= stride; |
573 const int l2= stride + l1; | |
574 const int l3= stride + l2; | |
575 const int l4= stride + l3; | |
576 const int l5= stride + l4; | |
577 const int l6= stride + l5; | |
578 const int l7= stride + l6; | |
579 // const int l8= stride + l7; | |
580 // const int l9= stride + l8; | |
581 int x; | |
582 | |
583 src+= stride*3; | |
584 for(x=0; x<BLOCK_SIZE; x++) | |
585 { | |
586 int a= src[l3] - src[l4]; | |
587 int b= src[l4] - src[l5]; | |
588 int c= src[l5] - src[l6]; | |
589 | |
590 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); | |
591 d= MAX(d, 0); | |
592 | |
593 if(d < co->QP*2) | |
594 { | |
595 int v = d * SIGN(-b); | |
596 | |
597 src[l2] +=v>>3; | |
598 src[l3] +=v>>2; | |
599 src[l4] +=(3*v)>>3; | |
600 src[l5] -=(3*v)>>3; | |
601 src[l6] -=v>>2; | |
602 src[l7] -=v>>3; | |
603 | |
604 } | |
605 src++; | |
606 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
96 | 608 } |
609 | |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
610 #ifndef HAVE_ALTIVEC |
787 | 611 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
612 { |
163 | 613 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
614 /* | |
2979 | 615 uint8_t tmp[16]; |
616 const int l1= stride; | |
617 const int l2= stride + l1; | |
618 const int l3= stride + l2; | |
619 const int l4= (int)tmp - (int)src - stride*3; | |
620 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
621 const int l6= stride*3 + l3; | |
622 const int l7= stride + l6; | |
623 const int l8= stride + l7; | |
624 | |
625 memcpy(tmp, src+stride*7, 8); | |
626 memcpy(tmp+8, src+stride*8, 8); | |
163 | 627 */ |
2979 | 628 src+= stride*4; |
629 asm volatile( | |
163 | 630 |
631 #if 0 //slightly more accurate and slightly slower | |
2979 | 632 "pxor %%mm7, %%mm7 \n\t" // 0 |
633 "lea (%0, %1), %%"REG_a" \n\t" | |
634 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | |
635 // 0 1 2 3 4 5 6 7 | |
636 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 | |
637 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
638 | |
639 | |
640 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
641 "movq (%0), %%mm1 \n\t" // l0 | |
642 "movq %%mm0, %%mm2 \n\t" // l2 | |
643 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
644 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
645 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
646 | |
647 "movq (%%"REG_a"), %%mm1 \n\t" // l1 | |
648 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 | |
649 "movq %%mm1, %%mm4 \n\t" // l1 | |
650 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
651 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
652 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
653 | |
654 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
655 "psubusb %%mm1, %%mm0 \n\t" | |
656 "psubusb %%mm4, %%mm1 \n\t" | |
657 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
163 | 658 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 |
659 | |
2979 | 660 "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
661 "movq %%mm0, %%mm4 \n\t" // l4 | |
662 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
663 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
664 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
665 | |
666 "movq (%%"REG_c"), %%mm2 \n\t" // l5 | |
667 "movq %%mm3, %%mm5 \n\t" // l3 | |
668 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
669 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
670 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
671 | |
672 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
673 "psubusb %%mm3, %%mm0 \n\t" | |
674 "psubusb %%mm6, %%mm3 \n\t" | |
675 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
676 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
163 | 677 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 |
678 | |
2979 | 679 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 |
680 "movq %%mm6, %%mm5 \n\t" // l6 | |
681 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
682 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
683 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
684 | |
685 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 | |
686 "movq %%mm2, %%mm4 \n\t" // l5 | |
687 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
688 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
689 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
690 | |
691 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
692 "psubusb %%mm2, %%mm6 \n\t" | |
693 "psubusb %%mm4, %%mm2 \n\t" | |
694 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
163 | 695 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
696 | |
697 | |
2979 | 698 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
699 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? | |
700 "paddusb "MANGLE(b01)", %%mm4 \n\t" | |
701 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | |
702 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
703 "pand %%mm4, %%mm3 \n\t" | |
704 | |
705 "movq %%mm3, %%mm1 \n\t" | |
706 // "psubusb "MANGLE(b01)", %%mm3 \n\t" | |
707 PAVGB(%%mm7, %%mm3) | |
708 PAVGB(%%mm7, %%mm3) | |
709 "paddusb %%mm1, %%mm3 \n\t" | |
710 // "paddusb "MANGLE(b01)", %%mm3 \n\t" | |
711 | |
712 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 | |
713 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
714 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
715 "psubusb %%mm6, %%mm5 \n\t" | |
716 "psubusb %%mm4, %%mm6 \n\t" | |
717 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
718 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
719 "pxor %%mm6, %%mm0 \n\t" | |
720 "pand %%mm0, %%mm3 \n\t" | |
721 PMINUB(%%mm5, %%mm3, %%mm0) | |
722 | |
723 "psubusb "MANGLE(b01)", %%mm3 \n\t" | |
724 PAVGB(%%mm7, %%mm3) | |
725 | |
726 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" | |
727 "movq (%0, %1, 4), %%mm2 \n\t" | |
728 "pxor %%mm6, %%mm0 \n\t" | |
729 "pxor %%mm6, %%mm2 \n\t" | |
730 "psubb %%mm3, %%mm0 \n\t" | |
731 "paddb %%mm3, %%mm2 \n\t" | |
732 "pxor %%mm6, %%mm0 \n\t" | |
733 "pxor %%mm6, %%mm2 \n\t" | |
734 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" | |
735 "movq %%mm2, (%0, %1, 4) \n\t" | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
736 #endif //0 |
163 | 737 |
2979 | 738 "lea (%0, %1), %%"REG_a" \n\t" |
739 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
740 // 0 1 2 3 4 5 6 7 | |
741 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 | |
742 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
743 | |
744 | |
745 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 | |
746 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
747 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
748 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
163 | 749 // mm1=-l3-1, mm0=128-q |
750 | |
2979 | 751 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 |
752 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 | |
753 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
754 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
755 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 | |
756 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | |
757 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | |
758 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
759 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
760 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
163 | 761 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 |
762 | |
2979 | 763 "movq (%%"REG_a"), %%mm2 \n\t" // l1 |
764 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
765 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
766 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
767 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 | |
768 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | |
769 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
770 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
163 | 771 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
772 | |
2979 | 773 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 |
774 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 | |
775 "pxor %%mm6, %%mm1 \n\t" // -l7-1 | |
776 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
777 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 | |
778 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | |
779 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
780 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
163 | 781 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 |
782 | |
2979 | 783 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
784 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
785 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 | |
786 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
787 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
788 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
789 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
163 | 790 |
791 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
792 | |
2979 | 793 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
794 "movq %2, %%mm2 \n\t" // QP | |
795 PAVGB(%%mm6, %%mm2) // 128 + QP/2 | |
796 "psubb %%mm6, %%mm2 \n\t" | |
797 | |
798 "movq %%mm4, %%mm1 \n\t" | |
799 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
800 "pxor %%mm1, %%mm4 \n\t" | |
801 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
802 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
803 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
163 | 804 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 |
805 | |
2979 | 806 "movq %%mm4, %%mm3 \n\t" // d |
807 "psubusb "MANGLE(b01)", %%mm4 \n\t" | |
808 PAVGB(%%mm7, %%mm4) // d/32 | |
809 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
810 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
811 "pand %%mm2, %%mm4 \n\t" | |
812 | |
813 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 | |
814 "psubb %%mm0, %%mm5 \n\t" // q | |
815 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
816 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
817 "pxor %%mm7, %%mm5 \n\t" | |
818 | |
819 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
820 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
821 | |
822 "pand %%mm7, %%mm4 \n\t" | |
823 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" | |
824 "movq (%0, %1, 4), %%mm2 \n\t" | |
825 "pxor %%mm1, %%mm0 \n\t" | |
826 "pxor %%mm1, %%mm2 \n\t" | |
827 "paddb %%mm4, %%mm0 \n\t" | |
828 "psubb %%mm4, %%mm2 \n\t" | |
829 "pxor %%mm1, %%mm0 \n\t" | |
830 "pxor %%mm1, %%mm2 \n\t" | |
831 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" | |
832 "movq %%mm2, (%0, %1, 4) \n\t" | |
833 | |
834 : | |
835 : "r" (src), "r" ((long)stride), "m" (c->pQPb) | |
836 : "%"REG_a, "%"REG_c | |
837 ); | |
163 | 838 |
839 /* | |
2979 | 840 { |
841 int x; | |
842 src-= stride; | |
843 for(x=0; x<BLOCK_SIZE; x++) | |
844 { | |
845 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
846 if(ABS(middleEnergy)< 8*QP) | |
847 { | |
848 const int q=(src[l4] - src[l5])/2; | |
849 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
850 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
851 | |
852 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
853 d= MAX(d, 0); | |
854 | |
855 d= (5*d + 32) >> 6; | |
856 d*= SIGN(-middleEnergy); | |
857 | |
858 if(q>0) | |
859 { | |
860 d= d<0 ? 0 : d; | |
861 d= d>q ? q : d; | |
862 } | |
863 else | |
864 { | |
865 d= d>0 ? 0 : d; | |
866 d= d<q ? q : d; | |
867 } | |
868 | |
869 src[l4]-= d; | |
870 src[l5]+= d; | |
871 } | |
872 src++; | |
873 } | |
163 | 874 src-=8; |
2979 | 875 for(x=0; x<8; x++) |
876 { | |
877 int y; | |
878 for(y=4; y<6; y++) | |
879 { | |
880 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
881 int ad= ABS(d); | |
882 static int max=0; | |
883 static int sum=0; | |
884 static int num=0; | |
885 static int bias=0; | |
886 | |
887 if(max<ad) max=ad; | |
888 sum+= ad>3 ? 1 : 0; | |
889 if(ad>3) | |
890 { | |
891 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
892 } | |
893 if(y==4) bias+=d; | |
894 num++; | |
895 if(num%1000000 == 0) | |
896 { | |
897 printf(" %d %d %d %d\n", num, sum, max, bias); | |
898 } | |
899 } | |
900 } | |
163 | 901 } |
902 */ | |
903 #elif defined (HAVE_MMX) | |
2979 | 904 src+= stride*4; |
905 asm volatile( | |
906 "pxor %%mm7, %%mm7 \n\t" | |
907 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars | |
908 "and "ALIGN_MASK", %%"REG_c" \n\t" // align | |
909 // 0 1 2 3 4 5 6 7 | |
910 // %0 %0+%1 eax eax+%1 eax+2%1 %0'+2%1 eax+4%1 %0'+4%1 | |
911 // (eax = %0+2%1; %0' = %0 after it is advanced to eax+%1, i.e. line 3; REG_c = aligned stack scratch) | |
912 | |
913 "movq (%0), %%mm0 \n\t" | |
914 "movq %%mm0, %%mm1 \n\t" | |
915 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | |
916 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | |
917 | |
918 "movq (%0, %1), %%mm2 \n\t" | |
919 "lea (%0, %1, 2), %%"REG_a" \n\t" | |
920 "movq %%mm2, %%mm3 \n\t" | |
921 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | |
922 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | |
923 | |
924 "movq (%%"REG_a"), %%mm4 \n\t" | |
925 "movq %%mm4, %%mm5 \n\t" | |
926 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | |
927 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | |
928 | |
929 "paddw %%mm0, %%mm0 \n\t" // 2L0 | |
930 "paddw %%mm1, %%mm1 \n\t" // 2H0 | |
931 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 | |
932 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 | |
933 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 | |
934 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 | |
935 | |
936 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | |
937 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | |
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | |
939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | |
940 | |
941 "movq (%%"REG_a", %1), %%mm2 \n\t" | |
942 "movq %%mm2, %%mm3 \n\t" | |
943 "punpcklbw %%mm7, %%mm2 \n\t" // L3 | |
944 "punpckhbw %%mm7, %%mm3 \n\t" // H3 | |
945 | |
946 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | |
947 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | |
948 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
949 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
950 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
951 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
952 | |
953 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" | |
954 "movq %%mm0, %%mm1 \n\t" | |
955 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | |
956 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | |
957 | |
958 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | |
959 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | |
960 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 | |
961 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 | |
962 "paddw %%mm4, %%mm4 \n\t" // 2L2 | |
963 "paddw %%mm5, %%mm5 \n\t" // 2H2 | |
964 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | |
965 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | |
966 | |
967 "lea (%%"REG_a", %1), %0 \n\t" | |
968 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | |
969 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | |
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | |
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 //50 opcodes so far |
2979 | 973 "movq (%0, %1, 2), %%mm2 \n\t" |
974 "movq %%mm2, %%mm3 \n\t" | |
975 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | |
976 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | |
977 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | |
978 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | |
979 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | |
980 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | |
981 | |
982 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" | |
983 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | |
984 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | |
985 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" | |
986 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | |
987 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | |
988 | |
989 "paddw %%mm0, %%mm0 \n\t" // 2L4 | |
990 "paddw %%mm1, %%mm1 \n\t" // 2H4 | |
991 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 | |
992 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 | |
993 | |
994 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | |
995 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | |
996 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | |
997 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | |
998 | |
999 "movq (%0, %1, 4), %%mm2 \n\t" | |
1000 "movq %%mm2, %%mm3 \n\t" | |
1001 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | |
1002 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | |
1003 | |
1004 "paddw %%mm2, %%mm2 \n\t" // 2L7 | |
1005 "paddw %%mm3, %%mm3 \n\t" // 2H7 | |
1006 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | |
1007 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | |
1008 | |
1009 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
1010 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
140 | 1011 |
1012 #ifdef HAVE_MMX2 | |
2979 | 1013 "movq %%mm7, %%mm6 \n\t" // 0 |
1014 "psubw %%mm0, %%mm6 \n\t" | |
1015 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
1016 "movq %%mm7, %%mm6 \n\t" // 0 | |
1017 "psubw %%mm1, %%mm6 \n\t" | |
1018 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
1019 "movq %%mm7, %%mm6 \n\t" // 0 | |
1020 "psubw %%mm2, %%mm6 \n\t" | |
1021 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
1022 "movq %%mm7, %%mm6 \n\t" // 0 | |
1023 "psubw %%mm3, %%mm6 \n\t" | |
1024 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
140 | 1025 #else |
2979 | 1026 "movq %%mm7, %%mm6 \n\t" // 0 |
1027 "pcmpgtw %%mm0, %%mm6 \n\t" | |
1028 "pxor %%mm6, %%mm0 \n\t" | |
1029 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
1030 "movq %%mm7, %%mm6 \n\t" // 0 | |
1031 "pcmpgtw %%mm1, %%mm6 \n\t" | |
1032 "pxor %%mm6, %%mm1 \n\t" | |
1033 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
1034 "movq %%mm7, %%mm6 \n\t" // 0 | |
1035 "pcmpgtw %%mm2, %%mm6 \n\t" | |
1036 "pxor %%mm6, %%mm2 \n\t" | |
1037 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
1038 "movq %%mm7, %%mm6 \n\t" // 0 | |
1039 "pcmpgtw %%mm3, %%mm6 \n\t" | |
1040 "pxor %%mm6, %%mm3 \n\t" | |
1041 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
140 | 1042 #endif |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 #ifdef HAVE_MMX2 |
2979 | 1045 "pminsw %%mm2, %%mm0 \n\t" |
1046 "pminsw %%mm3, %%mm1 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 #else |
2979 | 1048 "movq %%mm0, %%mm6 \n\t" |
1049 "psubusw %%mm2, %%mm6 \n\t" | |
1050 "psubw %%mm6, %%mm0 \n\t" | |
1051 "movq %%mm1, %%mm6 \n\t" | |
1052 "psubusw %%mm3, %%mm6 \n\t" | |
1053 "psubw %%mm6, %%mm1 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1054 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 |
2979 | 1056 "movd %2, %%mm2 \n\t" // QP |
1057 "punpcklbw %%mm7, %%mm2 \n\t" | |
1058 | |
1059 "movq %%mm7, %%mm6 \n\t" // 0 | |
1060 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | |
1061 "pxor %%mm6, %%mm4 \n\t" | |
1062 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | |
1063 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | |
1064 "pxor %%mm7, %%mm5 \n\t" | |
1065 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1066 // 100 opcodes |
2979 | 1067 "psllw $3, %%mm2 \n\t" // 8QP |
1068 "movq %%mm2, %%mm3 \n\t" // 8QP | |
1069 "pcmpgtw %%mm4, %%mm2 \n\t" | |
1070 "pcmpgtw %%mm5, %%mm3 \n\t" | |
1071 "pand %%mm2, %%mm4 \n\t" | |
1072 "pand %%mm3, %%mm5 \n\t" | |
1073 | |
1074 | |
1075 "psubusw %%mm0, %%mm4 \n\t" // ld | |
1076 "psubusw %%mm1, %%mm5 \n\t" // hd | |
1077 | |
1078 | |
1079 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 | |
1080 "pmullw %%mm2, %%mm4 \n\t" | |
1081 "pmullw %%mm2, %%mm5 \n\t" | |
1082 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 | |
1083 "paddw %%mm2, %%mm4 \n\t" | |
1084 "paddw %%mm2, %%mm5 \n\t" | |
1085 "psrlw $6, %%mm4 \n\t" | |
1086 "psrlw $6, %%mm5 \n\t" | |
1087 | |
1088 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 | |
1089 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 | |
1090 | |
1091 "pxor %%mm2, %%mm2 \n\t" | |
1092 "pxor %%mm3, %%mm3 \n\t" | |
1093 | |
1094 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | |
1095 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) | |
1096 "pxor %%mm2, %%mm0 \n\t" | |
1097 "pxor %%mm3, %%mm1 \n\t" | |
1098 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| | |
1099 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| | |
1100 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 | |
1101 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 | |
1102 | |
1103 "pxor %%mm6, %%mm2 \n\t" | |
1104 "pxor %%mm7, %%mm3 \n\t" | |
1105 "pand %%mm2, %%mm4 \n\t" | |
1106 "pand %%mm3, %%mm5 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 #ifdef HAVE_MMX2 |
2979 | 1109 "pminsw %%mm0, %%mm4 \n\t" |
1110 "pminsw %%mm1, %%mm5 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 #else |
2979 | 1112 "movq %%mm4, %%mm2 \n\t" |
1113 "psubusw %%mm0, %%mm2 \n\t" | |
1114 "psubw %%mm2, %%mm4 \n\t" | |
1115 "movq %%mm5, %%mm2 \n\t" | |
1116 "psubusw %%mm1, %%mm2 \n\t" | |
1117 "psubw %%mm2, %%mm5 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1118 #endif |
2979 | 1119 "pxor %%mm6, %%mm4 \n\t" |
1120 "pxor %%mm7, %%mm5 \n\t" | |
1121 "psubw %%mm6, %%mm4 \n\t" | |
1122 "psubw %%mm7, %%mm5 \n\t" | |
1123 "packsswb %%mm5, %%mm4 \n\t" | |
1124 "movq (%0), %%mm0 \n\t" | |
1125 "paddb %%mm4, %%mm0 \n\t" | |
1126 "movq %%mm0, (%0) \n\t" | |
1127 "movq (%0, %1), %%mm0 \n\t" | |
1128 "psubb %%mm4, %%mm0 \n\t" | |
1129 "movq %%mm0, (%0, %1) \n\t" | |
1130 | |
1131 : "+r" (src) | |
1132 : "r" ((long)stride), "m" (c->pQPb) | |
1133 : "%"REG_a, "%"REG_c | |
1134 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1135 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1136 const int l1= stride; |
1137 const int l2= stride + l1; | |
1138 const int l3= stride + l2; | |
1139 const int l4= stride + l3; | |
1140 const int l5= stride + l4; | |
1141 const int l6= stride + l5; | |
1142 const int l7= stride + l6; | |
1143 const int l8= stride + l7; | |
1144 // const int l9= stride + l8; | |
1145 int x; | |
1146 src+= stride*3; | |
1147 for(x=0; x<BLOCK_SIZE; x++) | |
1148 { | |
1149 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
1150 if(ABS(middleEnergy) < 8*c->QP) | |
1151 { | |
1152 const int q=(src[l4] - src[l5])/2; | |
1153 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
1154 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
1155 | |
1156 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
1157 d= MAX(d, 0); | |
1158 | |
1159 d= (5*d + 32) >> 6; | |
1160 d*= SIGN(-middleEnergy); | |
1161 | |
1162 if(q>0) | |
1163 { | |
1164 d= d<0 ? 0 : d; | |
1165 d= d>q ? q : d; | |
1166 } | |
1167 else | |
1168 { | |
1169 d= d>0 ? 0 : d; | |
1170 d= d<q ? q : d; | |
1171 } | |
1172 | |
1173 src[l4]-= d; | |
1174 src[l5]+= d; | |
1175 } | |
1176 src++; | |
1177 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1178 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1179 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1180 #endif //HAVE_ALTIVEC |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1181 |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1182 #ifndef HAVE_ALTIVEC |
787 | 1183 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1184 { |
132 | 1185 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1186 asm volatile( |
1187 "pxor %%mm6, %%mm6 \n\t" | |
1188 "pcmpeqb %%mm7, %%mm7 \n\t" | |
1189 "movq %2, %%mm0 \n\t" | |
1190 "punpcklbw %%mm6, %%mm0 \n\t" | |
1191 "psrlw $1, %%mm0 \n\t" | |
1192 "psubw %%mm7, %%mm0 \n\t" | |
1193 "packuswb %%mm0, %%mm0 \n\t" | |
1194 "movq %%mm0, %3 \n\t" | |
1195 | |
1196 "lea (%0, %1), %%"REG_a" \n\t" | |
1197 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1198 | |
1199 // 0 1 2 3 4 5 6 7 8 9 | |
1200 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1201 |
169 | 1202 #undef FIND_MIN_MAX |
132 | 1203 #ifdef HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1204 #define REAL_FIND_MIN_MAX(addr)\ |
2979 | 1205 "movq " #addr ", %%mm0 \n\t"\ |
1206 "pminub %%mm0, %%mm7 \n\t"\ | |
1207 "pmaxub %%mm0, %%mm6 \n\t" | |
132 | 1208 #else |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1209 #define REAL_FIND_MIN_MAX(addr)\ |
2979 | 1210 "movq " #addr ", %%mm0 \n\t"\ |
1211 "movq %%mm7, %%mm1 \n\t"\ | |
1212 "psubusb %%mm0, %%mm6 \n\t"\ | |
1213 "paddb %%mm0, %%mm6 \n\t"\ | |
1214 "psubusb %%mm0, %%mm1 \n\t"\ | |
1215 "psubb %%mm1, %%mm7 \n\t" | |
132 | 1216 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1217 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1218 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1219 FIND_MIN_MAX((%%REGa)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1220 FIND_MIN_MAX((%%REGa, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1221 FIND_MIN_MAX((%%REGa, %1, 2)) |
130 | 1222 FIND_MIN_MAX((%0, %1, 4)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1223 FIND_MIN_MAX((%%REGd)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1224 FIND_MIN_MAX((%%REGd, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1225 FIND_MIN_MAX((%%REGd, %1, 2)) |
130 | 1226 FIND_MIN_MAX((%0, %1, 8)) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1227 |
2979 | 1228 "movq %%mm7, %%mm4 \n\t" |
1229 "psrlq $8, %%mm7 \n\t" | |
167 | 1230 #ifdef HAVE_MMX2 |
2979 | 1231 "pminub %%mm4, %%mm7 \n\t" // min of pixels |
1232 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
1233 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1234 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
1235 "pminub %%mm4, %%mm7 \n\t" | |
167 | 1236 #else |
2979 | 1237 "movq %%mm7, %%mm1 \n\t" |
1238 "psubusb %%mm4, %%mm1 \n\t" | |
1239 "psubb %%mm1, %%mm7 \n\t" | |
1240 "movq %%mm7, %%mm4 \n\t" | |
1241 "psrlq $16, %%mm7 \n\t" | |
1242 "movq %%mm7, %%mm1 \n\t" | |
1243 "psubusb %%mm4, %%mm1 \n\t" | |
1244 "psubb %%mm1, %%mm7 \n\t" | |
1245 "movq %%mm7, %%mm4 \n\t" | |
1246 "psrlq $32, %%mm7 \n\t" | |
1247 "movq %%mm7, %%mm1 \n\t" | |
1248 "psubusb %%mm4, %%mm1 \n\t" | |
1249 "psubb %%mm1, %%mm7 \n\t" | |
167 | 1250 #endif |
1251 | |
1252 | |
2979 | 1253 "movq %%mm6, %%mm4 \n\t" |
1254 "psrlq $8, %%mm6 \n\t" | |
132 | 1255 #ifdef HAVE_MMX2 |
2979 | 1256 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
1257 "pshufw $0xF9, %%mm6, %%mm4 \n\t" | |
1258 "pmaxub %%mm4, %%mm6 \n\t" | |
1259 "pshufw $0xFE, %%mm6, %%mm4 \n\t" | |
1260 "pmaxub %%mm4, %%mm6 \n\t" | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1261 #else |
2979 | 1262 "psubusb %%mm4, %%mm6 \n\t" |
1263 "paddb %%mm4, %%mm6 \n\t" | |
1264 "movq %%mm6, %%mm4 \n\t" | |
1265 "psrlq $16, %%mm6 \n\t" | |
1266 "psubusb %%mm4, %%mm6 \n\t" | |
1267 "paddb %%mm4, %%mm6 \n\t" | |
1268 "movq %%mm6, %%mm4 \n\t" | |
1269 "psrlq $32, %%mm6 \n\t" | |
1270 "psubusb %%mm4, %%mm6 \n\t" | |
1271 "paddb %%mm4, %%mm6 \n\t" | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1272 #endif |
2979 | 1273 "movq %%mm6, %%mm0 \n\t" // max |
1274 "psubb %%mm7, %%mm6 \n\t" // max - min | |
1275 "movd %%mm6, %%ecx \n\t" | |
1276 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" | |
1277 " jb 1f \n\t" | |
1278 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" | |
1279 "and "ALIGN_MASK", %%"REG_c" \n\t" | |
1280 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | |
1281 "punpcklbw %%mm7, %%mm7 \n\t" | |
1282 "punpcklbw %%mm7, %%mm7 \n\t" | |
1283 "punpcklbw %%mm7, %%mm7 \n\t" | |
1284 "movq %%mm7, (%%"REG_c") \n\t" | |
1285 | |
1286 "movq (%0), %%mm0 \n\t" // L10 | |
1287 "movq %%mm0, %%mm1 \n\t" // L10 | |
1288 "movq %%mm0, %%mm2 \n\t" // L10 | |
1289 "psllq $8, %%mm1 \n\t" | |
1290 "psrlq $8, %%mm2 \n\t" | |
1291 "movd -4(%0), %%mm3 \n\t" | |
1292 "movd 8(%0), %%mm4 \n\t" | |
1293 "psrlq $24, %%mm3 \n\t" | |
1294 "psllq $56, %%mm4 \n\t" | |
1295 "por %%mm3, %%mm1 \n\t" // L00 | |
1296 "por %%mm4, %%mm2 \n\t" // L20 | |
1297 "movq %%mm1, %%mm3 \n\t" // L00 | |
1298 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
1299 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
1300 "psubusb %%mm7, %%mm0 \n\t" | |
1301 "psubusb %%mm7, %%mm2 \n\t" | |
1302 "psubusb %%mm7, %%mm3 \n\t" | |
1303 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 | |
1304 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
1305 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
1306 "paddb %%mm2, %%mm0 \n\t" | |
1307 "paddb %%mm3, %%mm0 \n\t" | |
1308 | |
1309 "movq (%%"REG_a"), %%mm2 \n\t" // L11 | |
1310 "movq %%mm2, %%mm3 \n\t" // L11 | |
1311 "movq %%mm2, %%mm4 \n\t" // L11 | |
1312 "psllq $8, %%mm3 \n\t" | |
1313 "psrlq $8, %%mm4 \n\t" | |
1314 "movd -4(%%"REG_a"), %%mm5 \n\t" | |
1315 "movd 8(%%"REG_a"), %%mm6 \n\t" | |
1316 "psrlq $24, %%mm5 \n\t" | |
1317 "psllq $56, %%mm6 \n\t" | |
1318 "por %%mm5, %%mm3 \n\t" // L01 | |
1319 "por %%mm6, %%mm4 \n\t" // L21 | |
1320 "movq %%mm3, %%mm5 \n\t" // L01 | |
1321 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
1322 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
1323 "psubusb %%mm7, %%mm2 \n\t" | |
1324 "psubusb %%mm7, %%mm4 \n\t" | |
1325 "psubusb %%mm7, %%mm5 \n\t" | |
1326 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 | |
1327 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
1328 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
1329 "paddb %%mm4, %%mm2 \n\t" | |
1330 "paddb %%mm5, %%mm2 \n\t" | |
130 | 1331 // 0, 2, 3, 1 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1332 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
2979 | 1333 "movq " #src ", " #sx " \n\t" /* src[0] */\ |
1334 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
1335 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
1336 "psllq $8, " #lx " \n\t"\ | |
1337 "psrlq $8, " #t0 " \n\t"\ | |
1338 "movd -4" #src ", " #t1 " \n\t"\ | |
1339 "psrlq $24, " #t1 " \n\t"\ | |
1340 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
1341 "movd 8" #src ", " #t1 " \n\t"\ | |
1342 "psllq $56, " #t1 " \n\t"\ | |
1343 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
1344 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
1345 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
1346 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
1347 PAVGB(lx, pplx) \ | |
1348 "movq " #lx ", 8(%%"REG_c") \n\t"\ | |
1349 "movq (%%"REG_c"), " #lx " \n\t"\ | |
1350 "psubusb " #lx ", " #t1 " \n\t"\ | |
1351 "psubusb " #lx ", " #t0 " \n\t"\ | |
1352 "psubusb " #lx ", " #sx " \n\t"\ | |
1353 "movq "MANGLE(b00)", " #lx " \n\t"\ | |
1354 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ | |
1355 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
1356 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
1357 "paddb " #t1 ", " #t0 " \n\t"\ | |
1358 "paddb " #t0 ", " #sx " \n\t"\ | |
130 | 1359 \ |
2979 | 1360 PAVGB(plx, pplx) /* filtered */\ |
1361 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
1362 "movq " #t0 ", " #t1 " \n\t" /* dst */\ | |
1363 "psubusb %3, " #t0 " \n\t"\ | |
1364 "paddusb %3, " #t1 " \n\t"\ | |
1365 PMAXUB(t0, pplx)\ | |
1366 PMINUB(t1, pplx, t0)\ | |
1367 "paddb " #sx ", " #ppsx " \n\t"\ | |
1368 "paddb " #psx ", " #ppsx " \n\t"\ | |
1369 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ | |
1370 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
1371 "pcmpeqb " #lx ", " #ppsx " \n\t"\ | |
1372 "pand " #ppsx ", " #pplx " \n\t"\ | |
1373 "pandn " #dst ", " #ppsx " \n\t"\ | |
1374 "por " #pplx ", " #ppsx " \n\t"\ | |
1375 "movq " #ppsx ", " #dst " \n\t"\ | |
1376 "movq 8(%%"REG_c"), " #lx " \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1377 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1378 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1379 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) |
130 | 1380 /* |
1381 0000000 | |
1382 1111111 | |
1383 | |
1384 1111110 | |
1385 1111101 | |
1386 1111100 | |
1387 1111011 | |
1388 1111010 | |
1389 1111001 | |
1390 | |
1391 1111000 | |
1392 1110111 | |
1393 | |
1394 */ | |
2979 | 1395 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) |
1396 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
1397 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
1398 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
1399 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
1400 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
1401 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
1402 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
1403 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
1404 | |
1405 "1: \n\t" | |
1406 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) | |
1407 : "%"REG_a, "%"REG_d, "%"REG_c | |
1408 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1409 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1410 int y; |
1411 int min=255; | |
1412 int max=0; | |
1413 int avg; | |
1414 uint8_t *p; | |
1415 int s[10]; | |
1416 const int QP2= c->QP/2 + 1; | |
1417 | |
1418 for(y=1; y<9; y++) | |
1419 { | |
1420 int x; | |
1421 p= src + stride*y; | |
1422 for(x=1; x<9; x++) | |
1423 { | |
1424 p++; | |
1425 if(*p > max) max= *p; | |
1426 if(*p < min) min= *p; | |
1427 } | |
1428 } | |
1429 avg= (min + max + 1)>>1; | |
1430 | |
1431 if(max - min <deringThreshold) return; | |
1432 | |
1433 for(y=0; y<10; y++) | |
1434 { | |
1435 int t = 0; | |
1436 | |
1437 if(src[stride*y + 0] > avg) t+= 1; | |
1438 if(src[stride*y + 1] > avg) t+= 2; | |
1439 if(src[stride*y + 2] > avg) t+= 4; | |
1440 if(src[stride*y + 3] > avg) t+= 8; | |
1441 if(src[stride*y + 4] > avg) t+= 16; | |
1442 if(src[stride*y + 5] > avg) t+= 32; | |
1443 if(src[stride*y + 6] > avg) t+= 64; | |
1444 if(src[stride*y + 7] > avg) t+= 128; | |
1445 if(src[stride*y + 8] > avg) t+= 256; | |
1446 if(src[stride*y + 9] > avg) t+= 512; | |
1447 | |
1448 t |= (~t)<<16; | |
1449 t &= (t<<1) & (t>>1); | |
1450 s[y] = t; | |
1451 } | |
1452 | |
1453 for(y=1; y<9; y++) | |
1454 { | |
1455 int t = s[y-1] & s[y] & s[y+1]; | |
1456 t|= t>>16; | |
1457 s[y-1]= t; | |
1458 } | |
1459 | |
1460 for(y=1; y<9; y++) | |
1461 { | |
1462 int x; | |
1463 int t = s[y-1]; | |
1464 | |
1465 p= src + stride*y; | |
1466 for(x=1; x<9; x++) | |
1467 { | |
1468 p++; | |
1469 if(t & (1<<x)) | |
1470 { | |
1471 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
1472 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
1473 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
1474 f= (f + 8)>>4; | |
134 | 1475 |
167 | 1476 #ifdef DEBUG_DERING_THRESHOLD |
2979 | 1477 asm volatile("emms\n\t":); |
1478 { | |
1479 static long long numPixels=0; | |
1480 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
1481 // if((max-min)<20 || (max-min)*QP<200) | |
1482 // if((max-min)*QP < 500) | |
1483 // if(max-min<QP/2) | |
1484 if(max-min < 20) | |
1485 { | |
1486 static int numSkiped=0; | |
1487 static int errorSum=0; | |
1488 static int worstQP=0; | |
1489 static int worstRange=0; | |
1490 static int worstDiff=0; | |
1491 int diff= (f - *p); | |
1492 int absDiff= ABS(diff); | |
1493 int error= diff*diff; | |
1494 | |
1495 if(x==1 || x==8 || y==1 || y==8) continue; | |
1496 | |
1497 numSkiped++; | |
1498 if(absDiff > worstDiff) | |
1499 { | |
1500 worstDiff= absDiff; | |
1501 worstQP= QP; | |
1502 worstRange= max-min; | |
1503 } | |
1504 errorSum+= error; | |
1505 | |
1506 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
1507 { | |
1508 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
1509 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
1510 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
1511 worstDiff, (float)numSkiped/numPixels); | |
1512 } | |
1513 } | |
1514 } | |
167 | 1515 #endif |
2979 | 1516 if (*p + QP2 < f) *p= *p + QP2; |
1517 else if(*p - QP2 > f) *p= *p - QP2; | |
1518 else *p=f; | |
1519 } | |
1520 } | |
1521 } | |
167 | 1522 #ifdef DEBUG_DERING_THRESHOLD |
2979 | 1523 if(max-min < 20) |
1524 { | |
1525 for(y=1; y<9; y++) | |
1526 { | |
1527 int x; | |
1528 int t = 0; | |
1529 p= src + stride*y; | |
1530 for(x=1; x<9; x++) | |
1531 { | |
1532 p++; | |
1533 *p = MIN(*p + 20, 255); | |
1534 } | |
1535 } | |
1536 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
1537 } | |
167 | 1538 #endif |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1539 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1540 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1541 #endif //HAVE_ALTIVEC |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1542 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1543 /** |
1109 | 1544 * Deinterlaces the given block by linearly interpolating every second line. |
142 | 1545 * will be called for every 8x8 block and can read & write from line 4-15 |
1546 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1547 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1548 */ |
169 | 1549 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1550 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1551 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1552 src+= 4*stride; |
1553 asm volatile( | |
1554 "lea (%0, %1), %%"REG_a" \n\t" | |
1555 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | |
1556 // 0 1 2 3 4 5 6 7 8 9 | |
1557 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 | |
1558 | |
1559 "movq (%0), %%mm0 \n\t" | |
1560 "movq (%%"REG_a", %1), %%mm1 \n\t" | |
1561 PAVGB(%%mm1, %%mm0) | |
1562 "movq %%mm0, (%%"REG_a") \n\t" | |
1563 "movq (%0, %1, 4), %%mm0 \n\t" | |
1564 PAVGB(%%mm0, %%mm1) | |
1565 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" | |
1566 "movq (%%"REG_c", %1), %%mm1 \n\t" | |
1567 PAVGB(%%mm1, %%mm0) | |
1568 "movq %%mm0, (%%"REG_c") \n\t" | |
1569 "movq (%0, %1, 8), %%mm0 \n\t" | |
1570 PAVGB(%%mm0, %%mm1) | |
1571 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" | |
1572 | |
1573 : : "r" (src), "r" ((long)stride) | |
1574 : "%"REG_a, "%"REG_c | |
1575 ); | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1576 #else |
2979 | 1577 int a, b, x; |
1578 src+= 4*stride; | |
1579 | |
1580 for(x=0; x<2; x++){ | |
1581 a= *(uint32_t*)&src[stride*0]; | |
1582 b= *(uint32_t*)&src[stride*2]; | |
1583 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1584 a= *(uint32_t*)&src[stride*4]; | |
1585 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1586 b= *(uint32_t*)&src[stride*6]; | |
1587 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1588 a= *(uint32_t*)&src[stride*8]; | |
1589 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1590 src += 4; | |
1591 } | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1592 #endif |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1593 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1594 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1595 /** |
1109 | 1596 * Deinterlaces the given block by cubic interpolating every second line. |
142 | 1597 * will be called for every 8x8 block and can read & write from line 4-15 |
1598 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1599 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1600 * this filter will read lines 3-15 and write 7-13 | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1601 */ |
169 | 1602 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1603 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1605 src+= stride*3; |
1606 asm volatile( | |
1607 "lea (%0, %1), %%"REG_a" \n\t" | |
1608 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1609 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" | |
1610 "add %1, %%"REG_c" \n\t" | |
1611 "pxor %%mm7, %%mm7 \n\t" | |
1612 // 0 1 2 3 4 5 6 7 8 9 10 | |
1613 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1614 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1615 #define REAL_DEINT_CUBIC(a,b,c,d,e)\ |
2979 | 1616 "movq " #a ", %%mm0 \n\t"\ |
1617 "movq " #b ", %%mm1 \n\t"\ | |
1618 "movq " #d ", %%mm2 \n\t"\ | |
1619 "movq " #e ", %%mm3 \n\t"\ | |
1620 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
1621 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ | |
1622 "movq %%mm0, %%mm2 \n\t"\ | |
1623 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1624 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
1625 "movq %%mm1, %%mm3 \n\t"\ | |
1626 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1627 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1628 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
1629 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
1630 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
1631 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
1632 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
1633 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
1634 "packuswb %%mm3, %%mm1 \n\t"\ | |
1635 "movq %%mm1, " #c " \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1636 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1637 |
2979 | 1638 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1)) |
1639 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8)) | |
1640 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc)) | |
1641 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) | |
1642 | |
1643 : : "r" (src), "r" ((long)stride) | |
1644 : "%"REG_a, "%"REG_d, "%"REG_c | |
1645 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1646 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1647 int x; |
1648 src+= stride*3; | |
1649 for(x=0; x<8; x++) | |
1650 { | |
1651 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); | |
1652 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
1653 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
1654 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
1655 src++; | |
1656 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1657 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1658 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1659 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1660 /** |
1109 | 1661 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
142 | 1662 * will be called for every 8x8 block and can read & write from line 4-15 |
1663 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1664 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
787 | 1665 * this filter will read lines 4-13 and write 5-11 |
1666 */ | |
1667 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
1668 { | |
1669 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
2979 | 1670 src+= stride*4; |
1671 asm volatile( | |
1672 "lea (%0, %1), %%"REG_a" \n\t" | |
1673 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1674 "pxor %%mm7, %%mm7 \n\t" | |
1675 "movq (%2), %%mm0 \n\t" | |
1676 // 0 1 2 3 4 5 6 7 8 9 10 | |
1677 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
787 | 1678 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1679 #define REAL_DEINT_FF(a,b,c,d)\ |
2979 | 1680 "movq " #a ", %%mm1 \n\t"\ |
1681 "movq " #b ", %%mm2 \n\t"\ | |
1682 "movq " #c ", %%mm3 \n\t"\ | |
1683 "movq " #d ", %%mm4 \n\t"\ | |
1684 PAVGB(%%mm3, %%mm1) \ | |
1685 PAVGB(%%mm4, %%mm0) \ | |
1686 "movq %%mm0, %%mm3 \n\t"\ | |
1687 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1688 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1689 "movq %%mm1, %%mm4 \n\t"\ | |
1690 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1691 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
1692 "psllw $2, %%mm1 \n\t"\ | |
1693 "psllw $2, %%mm4 \n\t"\ | |
1694 "psubw %%mm0, %%mm1 \n\t"\ | |
1695 "psubw %%mm3, %%mm4 \n\t"\ | |
1696 "movq %%mm2, %%mm5 \n\t"\ | |
1697 "movq %%mm2, %%mm0 \n\t"\ | |
1698 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1699 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1700 "paddw %%mm2, %%mm1 \n\t"\ | |
1701 "paddw %%mm5, %%mm4 \n\t"\ | |
1702 "psraw $2, %%mm1 \n\t"\ | |
1703 "psraw $2, %%mm4 \n\t"\ | |
1704 "packuswb %%mm4, %%mm1 \n\t"\ | |
1705 "movq %%mm1, " #b " \n\t"\ | |
787 | 1706 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1707 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1708 |
2979 | 1709 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) |
1710 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) | |
1711 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) | |
1712 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) | |
1713 | |
1714 "movq %%mm0, (%2) \n\t" | |
1715 : : "r" (src), "r" ((long)stride), "r"(tmp) | |
1716 : "%"REG_a, "%"REG_d | |
1717 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1718 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1719 int x; |
1720 src+= stride*4; | |
1721 for(x=0; x<8; x++) | |
1722 { | |
1723 int t1= tmp[x]; | |
1724 int t2= src[stride*1]; | |
1725 | |
1726 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); | |
1727 t1= src[stride*4]; | |
1728 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); | |
1729 t2= src[stride*6]; | |
1730 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); | |
1731 t1= src[stride*8]; | |
1732 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); | |
1733 tmp[x]= t1; | |
1734 | |
1735 src++; | |
1736 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1737 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
787 | 1738 } |
1739 | |
1740 /** | |
1157 | 1741 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
1742 * will be called for every 8x8 block and can read & write from line 4-15 | |
1743 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1744 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1745 * this filter will read lines 4-13 and write 4-11 | |
1746 */ | |
1747 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
1748 { | |
1749 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
2979 | 1750 src+= stride*4; |
1751 asm volatile( | |
1752 "lea (%0, %1), %%"REG_a" \n\t" | |
1753 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1754 "pxor %%mm7, %%mm7 \n\t" | |
1755 "movq (%2), %%mm0 \n\t" | |
1756 "movq (%3), %%mm1 \n\t" | |
1757 // 0 1 2 3 4 5 6 7 8 9 10 | |
1758 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
1157 | 1759 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1760 #define REAL_DEINT_L5(t1,t2,a,b,c)\ |
2979 | 1761 "movq " #a ", %%mm2 \n\t"\ |
1762 "movq " #b ", %%mm3 \n\t"\ | |
1763 "movq " #c ", %%mm4 \n\t"\ | |
1764 PAVGB(t2, %%mm3) \ | |
1765 PAVGB(t1, %%mm4) \ | |
1766 "movq %%mm2, %%mm5 \n\t"\ | |
1767 "movq %%mm2, " #t1 " \n\t"\ | |
1768 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1769 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1770 "movq %%mm2, %%mm6 \n\t"\ | |
1771 "paddw %%mm2, %%mm2 \n\t"\ | |
1772 "paddw %%mm6, %%mm2 \n\t"\ | |
1773 "movq %%mm5, %%mm6 \n\t"\ | |
1774 "paddw %%mm5, %%mm5 \n\t"\ | |
1775 "paddw %%mm6, %%mm5 \n\t"\ | |
1776 "movq %%mm3, %%mm6 \n\t"\ | |
1777 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1778 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
1779 "paddw %%mm3, %%mm3 \n\t"\ | |
1780 "paddw %%mm6, %%mm6 \n\t"\ | |
1781 "paddw %%mm3, %%mm2 \n\t"\ | |
1782 "paddw %%mm6, %%mm5 \n\t"\ | |
1783 "movq %%mm4, %%mm6 \n\t"\ | |
1784 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1785 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
1786 "psubw %%mm4, %%mm2 \n\t"\ | |
1787 "psubw %%mm6, %%mm5 \n\t"\ | |
1788 "psraw $2, %%mm2 \n\t"\ | |
1789 "psraw $2, %%mm5 \n\t"\ | |
1790 "packuswb %%mm5, %%mm2 \n\t"\ | |
1791 "movq %%mm2, " #a " \n\t"\ | |
1157 | 1792 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1793 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1794 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1795 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1796 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1797 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1798 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) |
2967 | 1799 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1800 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1801 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1802 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) |
1157 | 1803 |
2979 | 1804 "movq %%mm0, (%2) \n\t" |
1805 "movq %%mm1, (%3) \n\t" | |
1806 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2) | |
1807 : "%"REG_a, "%"REG_d | |
1808 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1809 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1810 int x; |
1811 src+= stride*4; | |
1812 for(x=0; x<8; x++) | |
1813 { | |
1814 int t1= tmp[x]; | |
1815 int t2= tmp2[x]; | |
1816 int t3= src[0]; | |
1817 | |
1818 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
1819 t1= src[stride*1]; | |
1820 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
1821 t2= src[stride*2]; | |
1822 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
1823 t3= src[stride*3]; | |
1824 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
1825 t1= src[stride*4]; | |
1826 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
1827 t2= src[stride*5]; | |
1828 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
1829 t3= src[stride*6]; | |
1830 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
1831 t1= src[stride*7]; | |
1832 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
1833 | |
1834 tmp[x]= t3; | |
1835 tmp2[x]= t1; | |
1836 | |
1837 src++; | |
1838 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1839 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
1157 | 1840 } |
1841 | |
1842 /** | |
1109 | 1843 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
787 | 1844 * will be called for every 8x8 block and can read & write from line 4-15 |
1845 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1846 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
142 | 1847 * this filter will read lines 4-13 and write 4-11 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1848 */ |
1581 | 1849 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1850 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1851 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1852 src+= 4*stride; |
1853 asm volatile( | |
1854 "lea (%0, %1), %%"REG_a" \n\t" | |
1855 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1856 // 0 1 2 3 4 5 6 7 8 9 | |
1857 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
1858 | |
1859 "movq (%2), %%mm0 \n\t" // L0 | |
1860 "movq (%%"REG_a"), %%mm1 \n\t" // L2 | |
1861 PAVGB(%%mm1, %%mm0) // L0+L2 | |
1862 "movq (%0), %%mm2 \n\t" // L1 | |
1863 PAVGB(%%mm2, %%mm0) | |
1864 "movq %%mm0, (%0) \n\t" | |
1865 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 | |
1866 PAVGB(%%mm0, %%mm2) // L1+L3 | |
1867 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |
1868 "movq %%mm2, (%%"REG_a") \n\t" | |
1869 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 | |
1870 PAVGB(%%mm2, %%mm1) // L2+L4 | |
1871 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |
1872 "movq %%mm1, (%%"REG_a", %1) \n\t" | |
1873 "movq (%0, %1, 4), %%mm1 \n\t" // L5 | |
1874 PAVGB(%%mm1, %%mm0) // L3+L5 | |
1875 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |
1876 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" | |
1877 "movq (%%"REG_d"), %%mm0 \n\t" // L6 | |
1878 PAVGB(%%mm0, %%mm2) // L4+L6 | |
1879 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |
1880 "movq %%mm2, (%0, %1, 4) \n\t" | |
1881 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 | |
1882 PAVGB(%%mm2, %%mm1) // L5+L7 | |
1883 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |
1884 "movq %%mm1, (%%"REG_d") \n\t" | |
1885 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 | |
1886 PAVGB(%%mm1, %%mm0) // L6+L8 | |
1887 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 | |
1888 "movq %%mm0, (%%"REG_d", %1) \n\t" | |
1889 "movq (%0, %1, 8), %%mm0 \n\t" // L9 | |
1890 PAVGB(%%mm0, %%mm2) // L7+L9 | |
1891 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 | |
1892 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" | |
1893 "movq %%mm1, (%2) \n\t" | |
1894 | |
1895 : : "r" (src), "r" ((long)stride), "r" (tmp) | |
1896 : "%"REG_a, "%"REG_d | |
1897 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1898 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
2979 | 1899 int a, b, c, x; |
1900 src+= 4*stride; | |
1901 | |
1902 for(x=0; x<2; x++){ | |
1903 a= *(uint32_t*)&tmp[stride*0]; | |
1904 b= *(uint32_t*)&src[stride*0]; | |
1905 c= *(uint32_t*)&src[stride*1]; | |
1906 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); | |
1907 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1908 | |
1909 a= *(uint32_t*)&src[stride*2]; | |
1910 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); | |
1911 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); | |
1912 | |
1913 b= *(uint32_t*)&src[stride*3]; | |
1914 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); | |
1915 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); | |
1916 | |
1917 c= *(uint32_t*)&src[stride*4]; | |
1918 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); | |
1919 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1920 | |
1921 a= *(uint32_t*)&src[stride*5]; | |
1922 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); | |
1923 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); | |
1924 | |
1925 b= *(uint32_t*)&src[stride*6]; | |
1926 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); | |
1927 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); | |
1928 | |
1929 c= *(uint32_t*)&src[stride*7]; | |
1930 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); | |
1931 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); | |
1932 | |
1933 a= *(uint32_t*)&src[stride*8]; | |
1934 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); | |
1935 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); | |
1936 | |
1937 *(uint32_t*)&tmp[stride*0]= c; | |
1938 src += 4; | |
1939 tmp += 4; | |
1940 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
1941 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1942 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1943 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1944 /** |
1109 | 1945 * Deinterlaces the given block by applying a median filter to every second line. |
142 | 1946 * will be called for every 8x8 block and can read & write from line 4-15, |
1947 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1948 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
 * Counting from line 4 (src + 4*stride), lines 1, 3, 5 and 7 are each replaced by the
 * per-pixel median of themselves and the lines directly above and below; lines 0-8 are read.
 * @param src    pointer to the block; the filter itself starts 4 lines below it
 * @param stride distance in bytes between two consecutive lines
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1949 */ |
169 | 1950 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1951 { |
107 | 1952 #ifdef HAVE_MMX |
2979 | 1953 src+= 4*stride; |
107 | 1954 #ifdef HAVE_MMX2 |
// MMX2 path: for each output line, median(a,b,c) = min(max(a,b), max(min(a,b), c))
// is evaluated on 8 bytes at once with pmaxub/pminub.
2979 | 1955 asm volatile( |
1956 "lea (%0, %1), %%"REG_a" \n\t" | |
1957 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
1958 // 0 1 2 3 4 5 6 7 8 9 | |
1959 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
1960 | |
// lines 0,1,2 -> line 1
1961 "movq (%0), %%mm0 \n\t" // | |
1962 "movq (%%"REG_a", %1), %%mm2 \n\t" // | |
1963 "movq (%%"REG_a"), %%mm1 \n\t" // | |
1964 "movq %%mm0, %%mm3 \n\t" | |
1965 "pmaxub %%mm1, %%mm0 \n\t" // | |
1966 "pminub %%mm3, %%mm1 \n\t" // | |
1967 "pmaxub %%mm2, %%mm1 \n\t" // | |
1968 "pminub %%mm1, %%mm0 \n\t" | |
1969 "movq %%mm0, (%%"REG_a") \n\t" | |
1970 | |
// lines 2 (still in mm2),3,4 -> line 3
1971 "movq (%0, %1, 4), %%mm0 \n\t" // | |
1972 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // | |
1973 "movq %%mm2, %%mm3 \n\t" | |
1974 "pmaxub %%mm1, %%mm2 \n\t" // | |
1975 "pminub %%mm3, %%mm1 \n\t" // | |
1976 "pmaxub %%mm0, %%mm1 \n\t" // | |
1977 "pminub %%mm1, %%mm2 \n\t" | |
1978 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" | |
1979 | |
// lines 4 (still in mm0),5,6 -> line 5
1980 "movq (%%"REG_d"), %%mm2 \n\t" // | |
1981 "movq (%%"REG_d", %1), %%mm1 \n\t" // | |
1982 "movq %%mm2, %%mm3 \n\t" | |
1983 "pmaxub %%mm0, %%mm2 \n\t" // | |
1984 "pminub %%mm3, %%mm0 \n\t" // | |
1985 "pmaxub %%mm1, %%mm0 \n\t" // | |
1986 "pminub %%mm0, %%mm2 \n\t" | |
1987 "movq %%mm2, (%%"REG_d") \n\t" | |
1988 | |
// lines 6 (still in mm1),7,8 -> line 7
1989 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // | |
1990 "movq (%0, %1, 8), %%mm0 \n\t" // | |
1991 "movq %%mm2, %%mm3 \n\t" | |
1992 "pmaxub %%mm0, %%mm2 \n\t" // | |
1993 "pminub %%mm3, %%mm0 \n\t" // | |
1994 "pmaxub %%mm1, %%mm0 \n\t" // | |
1995 "pminub %%mm0, %%mm2 \n\t" | |
1996 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" | |
1997 | |
1998 | |
1999 : : "r" (src), "r" ((long)stride) | |
2000 : "%"REG_a, "%"REG_d | |
2001 ); | |
107 | 2002 |
2003 #else // MMX without MMX2 | |
2979 | 2004 asm volatile( |
2005 "lea (%0, %1), %%"REG_a" \n\t" | |
2006 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | |
2007 // 0 1 2 3 4 5 6 7 8 9 | |
2008 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
// mm7 is zeroed here and serves as the comparison operand of pcmpeqb inside MEDIAN
2009 "pxor %%mm7, %%mm7 \n\t" | |
107 | 2010 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
// MEDIAN(a,b,c): plain MMX has no pmaxub/pminub, so byte masks are built from
// saturating subtractions compared against zero, combined with pxor/por/pand,
// and the result is stored back to operand b.
2011 #define REAL_MEDIAN(a,b,c)\ |
2979 | 2012 "movq " #a ", %%mm0 \n\t"\ |
2013 "movq " #b ", %%mm2 \n\t"\ | |
2014 "movq " #c ", %%mm1 \n\t"\ | |
2015 "movq %%mm0, %%mm3 \n\t"\ | |
2016 "movq %%mm1, %%mm4 \n\t"\ | |
2017 "movq %%mm2, %%mm5 \n\t"\ | |
2018 "psubusb %%mm1, %%mm3 \n\t"\ | |
2019 "psubusb %%mm2, %%mm4 \n\t"\ | |
2020 "psubusb %%mm0, %%mm5 \n\t"\ | |
2021 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
2022 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
2023 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
2024 "movq %%mm3, %%mm6 \n\t"\ | |
2025 "pxor %%mm4, %%mm3 \n\t"\ | |
2026 "pxor %%mm5, %%mm4 \n\t"\ | |
2027 "pxor %%mm6, %%mm5 \n\t"\ | |
2028 "por %%mm3, %%mm1 \n\t"\ | |
2029 "por %%mm4, %%mm2 \n\t"\ | |
2030 "por %%mm5, %%mm0 \n\t"\ | |
2031 "pand %%mm2, %%mm0 \n\t"\ | |
2032 "pand %%mm1, %%mm0 \n\t"\ | |
2033 "movq %%mm0, " #b " \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2034 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2035 |
// same line triples as the MMX2 path: (0,1,2) (2,3,4) (4,5,6) (6,7,8)
2979 | 2036 MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2037 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) |
2979 | 2038 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2039 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2040 |
2979 | 2041 : : "r" (src), "r" ((long)stride) |
2042 : "%"REG_a, "%"REG_d | |
2043 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2044 #endif //HAVE_MMX2 |
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2045 #else //HAVE_MMX |
// C fallback: one column at a time, 4 output lines per column.
2979 | 2046 int x, y; |
2047 src+= 4*stride; | |
2048 // FIXME - there should be a way to do a few columns in parallel like w/mmx | |
2049 for(x=0; x<8; x++) | |
2050 { | |
2051 uint8_t *colsrc = src; | |
2052 for (y=0; y<4; y++) | |
2053 { | |
2054 int a, b, c, d, e, f; | |
2055 a = colsrc[0 ]; | |
2056 b = colsrc[stride ]; | |
2057 c = colsrc[stride*2]; | |
// d, e, f become all-ones masks when a<b, b<c, c<a respectively
// (assumes a 32-bit int and an arithmetic right shift of negative values)
2058 d = (a-b)>>31; | |
2059 e = (b-c)>>31; | |
2060 f = (c-a)>>31; | |
// each term is either its value or all ones, so the AND keeps only the median
2061 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); | |
2062 colsrc += stride*2; | |
2063 } | |
2064 src++; | |
2065 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2066 #endif //HAVE_MMX |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2067 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2068 |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2069 #ifdef HAVE_MMX |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2070 /** |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2071 * transposes and shifts the given 8x8 block into dst1 and dst2 |
 * The 8 source rows are read srcStride bytes apart. The transposed data is
 * written in 4-byte pieces spaced 16 bytes apart: byte offsets 128..192 of dst1
 * and 48..112 of dst2 for source rows 0-3, and 4 bytes further (132.., 52..)
 * for source rows 4-7. Two of the pieces are stored to both buffers.
 * @param dst1      first destination buffer
 * @param dst2      second destination buffer
 * @param src       top-left byte of the 8x8 source block
 * @param srcStride distance in bytes between two source rows
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2072 */ |
169 | 2073 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2074 { |
2979 | 2075 asm( |
2076 "lea (%0, %1), %%"REG_a" \n\t" | |
2077 // 0 1 2 3 4 5 6 7 8 9 | |
2078 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
// first half: source rows 0-3, interleaved bytewise then wordwise
2079 "movq (%0), %%mm0 \n\t" // 12345678 | |
2080 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh | |
2081 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
2082 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
2083 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
2084 | |
2085 "movq (%%"REG_a", %1), %%mm1 \n\t" | |
2086 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" | |
2087 "movq %%mm1, %%mm4 \n\t" | |
2088 "punpcklbw %%mm3, %%mm1 \n\t" | |
2089 "punpckhbw %%mm3, %%mm4 \n\t" | |
2090 | |
2091 "movq %%mm0, %%mm3 \n\t" | |
2092 "punpcklwd %%mm1, %%mm0 \n\t" | |
2093 "punpckhwd %%mm1, %%mm3 \n\t" | |
2094 "movq %%mm2, %%mm1 \n\t" | |
2095 "punpcklwd %%mm4, %%mm2 \n\t" | |
2096 "punpckhwd %%mm4, %%mm1 \n\t" | |
2097 | |
// %2 is dst1, %3 is dst2; the pieces at 176(%2)/48(%3) and 192(%2)/64(%3) are duplicated
2098 "movd %%mm0, 128(%2) \n\t" | |
2099 "psrlq $32, %%mm0 \n\t" | |
2100 "movd %%mm0, 144(%2) \n\t" | |
2101 "movd %%mm3, 160(%2) \n\t" | |
2102 "psrlq $32, %%mm3 \n\t" | |
2103 "movd %%mm3, 176(%2) \n\t" | |
2104 "movd %%mm3, 48(%3) \n\t" | |
2105 "movd %%mm2, 192(%2) \n\t" | |
2106 "movd %%mm2, 64(%3) \n\t" | |
2107 "psrlq $32, %%mm2 \n\t" | |
2108 "movd %%mm2, 80(%3) \n\t" | |
2109 "movd %%mm1, 96(%3) \n\t" | |
2110 "psrlq $32, %%mm1 \n\t" | |
2111 "movd %%mm1, 112(%3) \n\t" | |
2112 | |
// advance eax by 4 rows so the same addressing pattern covers source rows 4-7
2113 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" | |
2114 | |
2115 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | |
2116 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh | |
2117 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
2118 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
2119 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
2120 | |
2121 "movq (%%"REG_a", %1), %%mm1 \n\t" | |
2122 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" | |
2123 "movq %%mm1, %%mm4 \n\t" | |
2124 "punpcklbw %%mm3, %%mm1 \n\t" | |
2125 "punpckhbw %%mm3, %%mm4 \n\t" | |
2126 | |
2127 "movq %%mm0, %%mm3 \n\t" | |
2128 "punpcklwd %%mm1, %%mm0 \n\t" | |
2129 "punpckhwd %%mm1, %%mm3 \n\t" | |
2130 "movq %%mm2, %%mm1 \n\t" | |
2131 "punpcklwd %%mm4, %%mm2 \n\t" | |
2132 "punpckhwd %%mm4, %%mm1 \n\t" | |
2133 | |
// same destinations as above, 4 bytes further
2134 "movd %%mm0, 132(%2) \n\t" | |
2135 "psrlq $32, %%mm0 \n\t" | |
2136 "movd %%mm0, 148(%2) \n\t" | |
2137 "movd %%mm3, 164(%2) \n\t" | |
2138 "psrlq $32, %%mm3 \n\t" | |
2139 "movd %%mm3, 180(%2) \n\t" | |
2140 "movd %%mm3, 52(%3) \n\t" | |
2141 "movd %%mm2, 196(%2) \n\t" | |
2142 "movd %%mm2, 68(%3) \n\t" | |
2143 "psrlq $32, %%mm2 \n\t" | |
2144 "movd %%mm2, 84(%3) \n\t" | |
2145 "movd %%mm1, 100(%3) \n\t" | |
2146 "psrlq $32, %%mm1 \n\t" | |
2147 "movd %%mm1, 116(%3) \n\t" | |
2148 | |
2149 | |
// NOTE(review): no "memory" clobber although the asm writes through dst1/dst2 - confirm this is intended
2150 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2) | |
2151 : "%"REG_a | |
2152 ); | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2153 } |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2154 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2155 /** |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2156 * transposes the given 8x8 block |
 * The source is read as 8 rows of 8 bytes spaced 16 bytes apart (offsets 0..112);
 * the result is written as 8 rows of 8 bytes spaced dstStride bytes apart.
 * @param dst       top-left byte of the destination block
 * @param dstStride distance in bytes between two destination rows
 * @param src       source buffer with a fixed 16-byte row pitch
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2157 */ |
169 | 2158 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2159 { |
2979 | 2160 asm( |
2161 "lea (%0, %1), %%"REG_a" \n\t" | |
2162 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t" | |
2163 // 0 1 2 3 4 5 6 7 8 9 | |
2164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | |
// first half: source rows 0-3 (offsets 0,16,32,48) become the left 4 bytes of dst rows 0-7
2165 "movq (%2), %%mm0 \n\t" // 12345678 | |
2166 "movq 16(%2), %%mm1 \n\t" // abcdefgh | |
2167 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
2168 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
2169 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
2170 | |
2171 "movq 32(%2), %%mm1 \n\t" | |
2172 "movq 48(%2), %%mm3 \n\t" | |
2173 "movq %%mm1, %%mm4 \n\t" | |
2174 "punpcklbw %%mm3, %%mm1 \n\t" | |
2175 "punpckhbw %%mm3, %%mm4 \n\t" | |
2176 | |
2177 "movq %%mm0, %%mm3 \n\t" | |
2178 "punpcklwd %%mm1, %%mm0 \n\t" | |
2179 "punpckhwd %%mm1, %%mm3 \n\t" | |
2180 "movq %%mm2, %%mm1 \n\t" | |
2181 "punpcklwd %%mm4, %%mm2 \n\t" | |
2182 "punpckhwd %%mm4, %%mm1 \n\t" | |
2183 | |
2184 "movd %%mm0, (%0) \n\t" | |
2185 "psrlq $32, %%mm0 \n\t" | |
2186 "movd %%mm0, (%%"REG_a") \n\t" | |
2187 "movd %%mm3, (%%"REG_a", %1) \n\t" | |
2188 "psrlq $32, %%mm3 \n\t" | |
2189 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" | |
2190 "movd %%mm2, (%0, %1, 4) \n\t" | |
2191 "psrlq $32, %%mm2 \n\t" | |
2192 "movd %%mm2, (%%"REG_d") \n\t" | |
2193 "movd %%mm1, (%%"REG_d", %1) \n\t" | |
2194 "psrlq $32, %%mm1 \n\t" | |
2195 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" | |
2196 | |
2197 | |
// second half: source rows 4-7 (offsets 64,80,96,112) become the right 4 bytes of dst rows 0-7
2198 "movq 64(%2), %%mm0 \n\t" // 12345678 | |
2199 "movq 80(%2), %%mm1 \n\t" // abcdefgh | |
2200 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
2201 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
2202 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
2203 | |
2204 "movq 96(%2), %%mm1 \n\t" | |
2205 "movq 112(%2), %%mm3 \n\t" | |
2206 "movq %%mm1, %%mm4 \n\t" | |
2207 "punpcklbw %%mm3, %%mm1 \n\t" | |
2208 "punpckhbw %%mm3, %%mm4 \n\t" | |
2209 | |
2210 "movq %%mm0, %%mm3 \n\t" | |
2211 "punpcklwd %%mm1, %%mm0 \n\t" | |
2212 "punpckhwd %%mm1, %%mm3 \n\t" | |
2213 "movq %%mm2, %%mm1 \n\t" | |
2214 "punpcklwd %%mm4, %%mm2 \n\t" | |
2215 "punpckhwd %%mm4, %%mm1 \n\t" | |
2216 | |
2217 "movd %%mm0, 4(%0) \n\t" | |
2218 "psrlq $32, %%mm0 \n\t" | |
2219 "movd %%mm0, 4(%%"REG_a") \n\t" | |
2220 "movd %%mm3, 4(%%"REG_a", %1) \n\t" | |
2221 "psrlq $32, %%mm3 \n\t" | |
2222 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" | |
2223 "movd %%mm2, 4(%0, %1, 4) \n\t" | |
2224 "psrlq $32, %%mm2 \n\t" | |
2225 "movd %%mm2, 4(%%"REG_d") \n\t" | |
2226 "movd %%mm1, 4(%%"REG_d", %1) \n\t" | |
2227 "psrlq $32, %%mm1 \n\t" | |
2228 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" | |
2229 | |
// NOTE(review): no "memory" clobber although the asm writes through dst - confirm this is intended
2230 :: "r" (dst), "r" ((long)dstStride), "r" (src) | |
2231 : "%"REG_a, "%"REG_d | |
2232 ); | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2233 } |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2234 #endif //HAVE_MMX |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2235 //static long test=0; |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2236 |
2041 | 2237 #ifndef HAVE_ALTIVEC |
/**
 * Temporal noise reducer for one 8x8 block.
 * Measures how much src differs from tempBlured over the 8x8 block, smooths
 * that measure with the values stored for the neighbouring entries of
 * tempBluredPast, and depending on the result either copies src into
 * tempBlured or writes a blend of both into src and tempBlured.
 * @param src            current 8x8 block (may be overwritten with the blend)
 * @param stride         distance in bytes between two lines of src and tempBlured
 * @param tempBlured     blurred reference block (always updated)
 * @param tempBluredPast entry for this block in the table of past difference values;
 *                       entries at -256, -1, +1 and +256 are read, and entries
 *                       127..129 are overwritten with the thresholds
 * @param maxNoise       three thresholds, maxNoise[0] .. maxNoise[2]
 */
943 | 2238 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2979 | 2239 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
156 | 2240 { |
2979 | 2241 // to save a register (FIXME do this outside of the loops) |
2242 tempBluredPast[127]= maxNoise[0]; | |
2243 tempBluredPast[128]= maxNoise[1]; | |
2244 tempBluredPast[129]= maxNoise[2]; | |
2967 | 2245 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 #define FAST_L2_DIFF |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 //#define L1_DIFF //you should change the thresholds too if you try that one |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
// SIMD path. Operands: %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory operand).
2979 | 2249 asm volatile( |
2250 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride | |
2251 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride | |
2252 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | |
2253 // 0 1 2 3 4 5 6 7 8 9 | |
2254 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 //FIXME reorder? |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 #ifdef L1_DIFF //needs mmx2 |
2979 | 2257 "movq (%0), %%mm0 \n\t" // L0 |
2258 "psadbw (%1), %%mm0 \n\t" // |L0-R0| | |
2259 "movq (%0, %2), %%mm1 \n\t" // L1 | |
2260 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| | |
2261 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
2262 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| | |
2263 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 | |
2264 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| | |
2265 | |
2266 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
2267 "paddw %%mm1, %%mm0 \n\t" | |
2268 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| | |
2269 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 | |
2270 "paddw %%mm2, %%mm0 \n\t" | |
2271 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| | |
2272 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 | |
2273 "paddw %%mm3, %%mm0 \n\t" | |
2274 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| | |
2275 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 | |
2276 "paddw %%mm4, %%mm0 \n\t" | |
2277 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| | |
2278 "paddw %%mm5, %%mm6 \n\t" | |
2279 "paddw %%mm7, %%mm6 \n\t" | |
2280 "paddw %%mm6, %%mm0 \n\t" | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2281 #else //L1_DIFF |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2282 #if defined (FAST_L2_DIFF) |
2979 | 2283 "pcmpeqb %%mm7, %%mm7 \n\t" |
2284 "movq "MANGLE(b80)", %%mm6 \n\t" | |
2285 "pxor %%mm0, %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
// fast variant: works on a PAVGB-based half difference of the row pair and
// accumulates scaled squares in mm0 (presumably an approximation of the exact
// variant below - TODO confirm against the PAVGB and b80 definitions)
2286 #define REAL_L2_DIFF_CORE(a, b)\ |
2979 | 2287 "movq " #a ", %%mm5 \n\t"\ |
2288 "movq " #b ", %%mm2 \n\t"\ | |
2289 "pxor %%mm7, %%mm2 \n\t"\ | |
2290 PAVGB(%%mm2, %%mm5)\ | |
2291 "paddb %%mm6, %%mm5 \n\t"\ | |
2292 "movq %%mm5, %%mm2 \n\t"\ | |
2293 "psllw $8, %%mm5 \n\t"\ | |
2294 "pmaddwd %%mm5, %%mm5 \n\t"\ | |
2295 "pmaddwd %%mm2, %%mm2 \n\t"\ | |
2296 "paddd %%mm2, %%mm5 \n\t"\ | |
2297 "psrld $14, %%mm5 \n\t"\ | |
2298 "paddd %%mm5, %%mm0 \n\t" | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2300 #else //defined (FAST_L2_DIFF) |
2979 | 2301 "pxor %%mm7, %%mm7 \n\t" |
2302 "pxor %%mm0, %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
// exact variant: unpack both rows to 16 bit, subtract, and accumulate the
// squared differences in mm0 via pmaddwd
2303 #define REAL_L2_DIFF_CORE(a, b)\ |
2979 | 2304 "movq " #a ", %%mm5 \n\t"\ |
2305 "movq " #b ", %%mm2 \n\t"\ | |
2306 "movq %%mm5, %%mm1 \n\t"\ | |
2307 "movq %%mm2, %%mm3 \n\t"\ | |
2308 "punpcklbw %%mm7, %%mm5 \n\t"\ | |
2309 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
2310 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
2311 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
2312 "psubw %%mm2, %%mm5 \n\t"\ | |
2313 "psubw %%mm3, %%mm1 \n\t"\ | |
2314 "pmaddwd %%mm5, %%mm5 \n\t"\ | |
2315 "pmaddwd %%mm1, %%mm1 \n\t"\ | |
2316 "paddd %%mm1, %%mm5 \n\t"\ | |
2317 "paddd %%mm5, %%mm0 \n\t" | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2318 |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2319 #endif //defined (FAST_L2_DIFF) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2320 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2321 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2322 |
// one invocation per line 0..7 of the block
2979 | 2323 L2_DIFF_CORE((%0) , (%1)) |
2324 L2_DIFF_CORE((%0, %2) , (%1, %2)) | |
2325 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2)) | |
2326 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa)) | |
2327 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4)) | |
2328 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd)) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2329 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) |
2979 | 2330 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc)) |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2332 #endif //L1_DIFF |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2333 |
// add the two dwords of mm0, then ecx = (4*d + left + right + above + below + 4) >> 3,
// where the neighbours are the entries 4 and 1024 bytes (1 and 256 elements) around tempBluredPast
2979 | 2334 "movq %%mm0, %%mm4 \n\t" |
2335 "psrlq $32, %%mm0 \n\t" | |
2336 "paddd %%mm0, %%mm4 \n\t" | |
2337 "movd %%mm4, %%ecx \n\t" | |
2338 "shll $2, %%ecx \n\t" | |
2339 "mov %3, %%"REG_d" \n\t" | |
2340 "addl -4(%%"REG_d"), %%ecx \n\t" | |
2341 "addl 4(%%"REG_d"), %%ecx \n\t" | |
2342 "addl -1024(%%"REG_d"), %%ecx \n\t" | |
2343 "addl $4, %%ecx \n\t" | |
2344 "addl 1024(%%"REG_d"), %%ecx \n\t" | |
2345 "shrl $3, %%ecx \n\t" | |
// NOTE(review): the smoothed value is stored here, while the C fallback stores the raw sum - confirm which is intended
2346 "movl %%ecx, (%%"REG_d") \n\t" | |
2347 | |
2348 // "mov %3, %%"REG_c" \n\t" | |
2349 // "mov %%"REG_c", test \n\t" | |
2350 // "jmp 4f \n\t" | |
// byte offsets 512 / 516 / 508 are elements 128 / 129 / 127, i.e. the copies of
// maxNoise[1] / maxNoise[2] / maxNoise[0] stored at the top of the function
2351 "cmpl 512(%%"REG_d"), %%ecx \n\t" | |
2352 " jb 2f \n\t" | |
2353 "cmpl 516(%%"REG_d"), %%ecx \n\t" | |
2354 " jb 1f \n\t" | |
2355 | |
// not below maxNoise[2]: copy src (L) into tempBlured (R) unchanged
2356 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride | |
2357 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | |
2358 "movq (%0), %%mm0 \n\t" // L0 | |
2359 "movq (%0, %2), %%mm1 \n\t" // L1 | |
2360 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
2361 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 | |
2362 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
2363 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 | |
2364 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 | |
2365 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 | |
2366 "movq %%mm0, (%1) \n\t" // L0 | |
2367 "movq %%mm1, (%1, %2) \n\t" // L1 | |
2368 "movq %%mm2, (%1, %2, 2) \n\t" // L2 | |
2369 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 | |
2370 "movq %%mm4, (%1, %2, 4) \n\t" // L4 | |
2371 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 | |
2372 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 | |
2373 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 | |
2374 "jmp 4f \n\t" | |
2375 | |
// label 1: one PAVGB of src and tempBlured per line, written to both
// (corresponds to the (ref + cur + 1)>>1 case of the C fallback)
2376 "1: \n\t" | |
2377 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride | |
2378 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | |
2379 "movq (%0), %%mm0 \n\t" // L0 | |
2380 PAVGB((%1), %%mm0) // L0 | |
2381 "movq (%0, %2), %%mm1 \n\t" // L1 | |
2382 PAVGB((%1, %2), %%mm1) // L1 | |
2383 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
2384 PAVGB((%1, %2, 2), %%mm2) // L2 | |
2385 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 | |
2386 PAVGB((%1, %%REGa), %%mm3) // L3 | |
2387 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
2388 PAVGB((%1, %2, 4), %%mm4) // L4 | |
2389 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 | |
2390 PAVGB((%1, %%REGd), %%mm5) // L5 | |
2391 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 | |
2392 PAVGB((%1, %%REGa, 2), %%mm6) // L6 | |
2393 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 | |
2394 PAVGB((%1, %%REGc), %%mm7) // L7 | |
2395 "movq %%mm0, (%1) \n\t" // R0 | |
2396 "movq %%mm1, (%1, %2) \n\t" // R1 | |
2397 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
2398 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 | |
2399 "movq %%mm4, (%1, %2, 4) \n\t" // R4 | |
2400 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 | |
2401 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 | |
2402 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 | |
2403 "movq %%mm0, (%0) \n\t" // L0 | |
2404 "movq %%mm1, (%0, %2) \n\t" // L1 | |
2405 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
2406 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 | |
2407 "movq %%mm4, (%0, %2, 4) \n\t" // L4 | |
2408 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 | |
2409 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 | |
2410 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 | |
2411 "jmp 4f \n\t" | |
2412 | |
// label 2: below maxNoise[1]; REG_d still holds tempBluredPast here, so the
// maxNoise[0] copy can be compared before REG_d is reloaded with 5*stride.
// PAVGB with tempBlured is applied twice per line, in two groups of 4 lines
// (corresponds to the (ref*3 + cur + 2)>>2 case of the C fallback)
2413 "2: \n\t" | |
2414 "cmpl 508(%%"REG_d"), %%ecx \n\t" | |
2415 " jb 3f \n\t" | |
2416 | |
2417 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride | |
2418 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | |
2419 "movq (%0), %%mm0 \n\t" // L0 | |
2420 "movq (%0, %2), %%mm1 \n\t" // L1 | |
2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
2422 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 | |
2423 "movq (%1), %%mm4 \n\t" // R0 | |
2424 "movq (%1, %2), %%mm5 \n\t" // R1 | |
2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2 | |
2426 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 | |
2427 PAVGB(%%mm4, %%mm0) | |
2428 PAVGB(%%mm5, %%mm1) | |
2429 PAVGB(%%mm6, %%mm2) | |
2430 PAVGB(%%mm7, %%mm3) | |
2431 PAVGB(%%mm4, %%mm0) | |
2432 PAVGB(%%mm5, %%mm1) | |
2433 PAVGB(%%mm6, %%mm2) | |
2434 PAVGB(%%mm7, %%mm3) | |
2435 "movq %%mm0, (%1) \n\t" // R0 | |
2436 "movq %%mm1, (%1, %2) \n\t" // R1 | |
2437 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
2438 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 | |
2439 "movq %%mm0, (%0) \n\t" // L0 | |
2440 "movq %%mm1, (%0, %2) \n\t" // L1 | |
2441 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
2442 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 | |
2443 | |
2444 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | |
2445 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 | |
2446 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 | |
2447 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 | |
2448 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | |
2449 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 | |
2450 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 | |
2451 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 | |
2452 PAVGB(%%mm4, %%mm0) | |
2453 PAVGB(%%mm5, %%mm1) | |
2454 PAVGB(%%mm6, %%mm2) | |
2455 PAVGB(%%mm7, %%mm3) | |
2456 PAVGB(%%mm4, %%mm0) | |
2457 PAVGB(%%mm5, %%mm1) | |
2458 PAVGB(%%mm6, %%mm2) | |
2459 PAVGB(%%mm7, %%mm3) | |
2460 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | |
2461 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 | |
2462 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 | |
2463 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 | |
2464 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | |
2465 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 | |
2466 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 | |
2467 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 | |
2468 "jmp 4f \n\t" | |
2469 | |
// label 3: below maxNoise[0]; same structure as label 2 with PAVGB applied
// three times per line (corresponds to the (ref*7 + cur + 4)>>3 case of the C fallback)
2470 "3: \n\t" | |
2471 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride | |
2472 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | |
2473 "movq (%0), %%mm0 \n\t" // L0 | |
2474 "movq (%0, %2), %%mm1 \n\t" // L1 | |
2475 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
2476 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 | |
2477 "movq (%1), %%mm4 \n\t" // R0 | |
2478 "movq (%1, %2), %%mm5 \n\t" // R1 | |
2479 "movq (%1, %2, 2), %%mm6 \n\t" // R2 | |
2480 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 | |
2481 PAVGB(%%mm4, %%mm0) | |
2482 PAVGB(%%mm5, %%mm1) | |
2483 PAVGB(%%mm6, %%mm2) | |
2484 PAVGB(%%mm7, %%mm3) | |
2485 PAVGB(%%mm4, %%mm0) | |
2486 PAVGB(%%mm5, %%mm1) | |
2487 PAVGB(%%mm6, %%mm2) | |
2488 PAVGB(%%mm7, %%mm3) | |
2489 PAVGB(%%mm4, %%mm0) | |
2490 PAVGB(%%mm5, %%mm1) | |
2491 PAVGB(%%mm6, %%mm2) | |
2492 PAVGB(%%mm7, %%mm3) | |
2493 "movq %%mm0, (%1) \n\t" // R0 | |
2494 "movq %%mm1, (%1, %2) \n\t" // R1 | |
2495 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
2496 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 | |
2497 "movq %%mm0, (%0) \n\t" // L0 | |
2498 "movq %%mm1, (%0, %2) \n\t" // L1 | |
2499 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
2500 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 | |
2501 | |
2502 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | |
2503 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 | |
2504 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 | |
2505 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 | |
2506 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | |
2507 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 | |
2508 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 | |
2509 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 | |
2510 PAVGB(%%mm4, %%mm0) | |
2511 PAVGB(%%mm5, %%mm1) | |
2512 PAVGB(%%mm6, %%mm2) | |
2513 PAVGB(%%mm7, %%mm3) | |
2514 PAVGB(%%mm4, %%mm0) | |
2515 PAVGB(%%mm5, %%mm1) | |
2516 PAVGB(%%mm6, %%mm2) | |
2517 PAVGB(%%mm7, %%mm3) | |
2518 PAVGB(%%mm4, %%mm0) | |
2519 PAVGB(%%mm5, %%mm1) | |
2520 PAVGB(%%mm6, %%mm2) | |
2521 PAVGB(%%mm7, %%mm3) | |
2522 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | |
2523 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 | |
2524 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 | |
2525 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 | |
2526 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | |
2527 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 | |
2528 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 | |
2529 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 | |
2530 | |
2531 "4: \n\t" | |
2532 | |
2533 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) | |
2534 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" | |
2535 ); | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2536 //printf("%d\n", test); |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2537 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
788 | 2538 { |
2979 | 2539 int y; |
2540 int d=0; | |
2541 // int sysd=0; | |
2542 int i; | |
2543 | |
// d = sum of squared differences between tempBlured and src over the 8x8 block
2544 for(y=0; y<8; y++) | |
2545 { | |
2546 int x; | |
2547 for(x=0; x<8; x++) | |
2548 { | |
2549 int ref= tempBlured[ x + y*stride ]; | |
2550 int cur= src[ x + y*stride ]; | |
2551 int d1=ref - cur; | |
2552 // if(x==0 || x==7) d1+= d1>>1; | |
2553 // if(y==0 || y==7) d1+= d1>>1; | |
2554 // d+= ABS(d1); | |
2555 d+= d1*d1; | |
2556 // sysd+= d1; | |
2557 } | |
2558 } | |
// keep the raw sum in i; d becomes the rounded average of 4*d and the four
// neighbouring entries of tempBluredPast (-256, -1, +1, +256 elements)
2559 i=d; | |
2560 d= ( | |
2561 4*d | |
2562 +(*(tempBluredPast-256)) | |
2563 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
2564 +(*(tempBluredPast+256)) | |
2565 +4)>>3; | |
// NOTE(review): the raw sum is stored, whereas the SIMD path stores the smoothed value - confirm which is intended
2566 *tempBluredPast=i; | |
2567 // ((*tempBluredPast)*3 + d + 2)>>2; | |
158 | 2568 |
156 | 2569 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
2570 /* | |
2571 Switch between | |
2572 1 0 0 0 0 0 0 (0) | |
2573 64 32 16 8 4 2 1 (1) | |
2574 64 48 36 27 20 15 11 (33) (approx) | |
2575 64 56 49 43 37 33 29 (200) (approx) | |
2576 */ | |
// NOTE(review): d == maxNoise[1] takes the else branch here, but the SIMD path
// (jb on the same threshold) treats it like d > maxNoise[1] - confirm
2979 | 2577 if(d > maxNoise[1]) |
2578 { | |
2579 if(d < maxNoise[2]) | |
2580 { | |
// rounded average of reference and current pixel, written to both buffers
2581 for(y=0; y<8; y++) | |
2582 { | |
2583 int x; | |
2584 for(x=0; x<8; x++) | |
2585 { | |
2586 int ref= tempBlured[ x + y*stride ]; | |
2587 int cur= src[ x + y*stride ]; | |
2588 tempBlured[ x + y*stride ]= | |
2589 src[ x + y*stride ]= | |
2590 (ref + cur + 1)>>1; | |
2591 } | |
2592 } | |
2593 } | |
2594 else | |
2595 { | |
// difference too large: src is left untouched and copied into tempBlured
2596 for(y=0; y<8; y++) | |
2597 { | |
2598 int x; | |
2599 for(x=0; x<8; x++) | |
2600 { | |
2601 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
2602 } | |
2603 } | |
2604 } | |
2605 } | |
2606 else | |
2607 { | |
2608 if(d < maxNoise[0]) | |
2609 { | |
// smallest difference: 7/8 reference + 1/8 current, written to both buffers
2610 for(y=0; y<8; y++) | |
2611 { | |
2612 int x; | |
2613 for(x=0; x<8; x++) | |
2614 { | |
2615 int ref= tempBlured[ x + y*stride ]; | |
2616 int cur= src[ x + y*stride ]; | |
2617 tempBlured[ x + y*stride ]= | |
2618 src[ x + y*stride ]= | |
2619 (ref*7 + cur + 4)>>3; | |
2620 } | |
2621 } | |
2622 } | |
2623 else | |
2624 { | |
// 3/4 reference + 1/4 current, written to both buffers
2625 for(y=0; y<8; y++) | |
2626 { | |
2627 int x; | |
2628 for(x=0; x<8; x++) | |
2629 { | |
2630 int ref= tempBlured[ x + y*stride ]; | |
2631 int cur= src[ x + y*stride ]; | |
2632 tempBlured[ x + y*stride ]= | |
2633 src[ x + y*stride ]= | |
2634 (ref*3 + cur + 2)>>2; | |
2635 } | |
2636 } | |
2637 } | |
2638 } | |
788 | 2639 } |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
2640 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
156 | 2641 } |
2041 | 2642 #endif //HAVE_ALTIVEC |
156 | 2643 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2644 #ifdef HAVE_MMX |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2645 /** |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2646 * accurate deblock filter |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2647 */ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2648 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
2979 | 2649 int64_t dc_mask, eq_mask, both_masks; |
2650 int64_t sums[10*8*2]; | |
2651 src+= step*3; // src points to begin of the 8x8 Block | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2652 //START_TIMER |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2653 asm volatile( |
2979 | 2654 "movq %0, %%mm7 \n\t" |
2655 "movq %1, %%mm6 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2656 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2657 ); |
2967 | 2658 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2659 asm volatile( |
2979 | 2660 "lea (%2, %3), %%"REG_a" \n\t" |
2661 // 0 1 2 3 4 5 6 7 8 9 | |
2662 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 | |
2663 | |
2664 "movq (%2), %%mm0 \n\t" | |
2665 "movq (%%"REG_a"), %%mm1 \n\t" | |
2666 "movq %%mm1, %%mm3 \n\t" | |
2667 "movq %%mm1, %%mm4 \n\t" | |
2668 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | |
2669 "paddb %%mm7, %%mm0 \n\t" | |
2670 "pcmpgtb %%mm6, %%mm0 \n\t" | |
2671 | |
2672 "movq (%%"REG_a",%3), %%mm2 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2673 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2674 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 2675 "psubb %%mm2, %%mm1 \n\t" |
2676 "paddb %%mm7, %%mm1 \n\t" | |
2677 "pcmpgtb %%mm6, %%mm1 \n\t" | |
2678 "paddb %%mm1, %%mm0 \n\t" | |
2679 | |
2680 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2681 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2682 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 2683 "psubb %%mm1, %%mm2 \n\t" |
2684 "paddb %%mm7, %%mm2 \n\t" | |
2685 "pcmpgtb %%mm6, %%mm2 \n\t" | |
2686 "paddb %%mm2, %%mm0 \n\t" | |
2687 | |
2688 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" | |
2689 | |
2690 "movq (%2, %3, 4), %%mm2 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2691 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2692 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 2693 "psubb %%mm2, %%mm1 \n\t" |
2694 "paddb %%mm7, %%mm1 \n\t" | |
2695 "pcmpgtb %%mm6, %%mm1 \n\t" | |
2696 "paddb %%mm1, %%mm0 \n\t" | |
2697 | |
2698 "movq (%%"REG_a"), %%mm1 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2699 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2700 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 2701 "psubb %%mm1, %%mm2 \n\t" |
2702 "paddb %%mm7, %%mm2 \n\t" | |
2703 "pcmpgtb %%mm6, %%mm2 \n\t" | |
2704 "paddb %%mm2, %%mm0 \n\t" | |
2705 | |
2706 "movq (%%"REG_a", %3), %%mm2 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2707 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2708 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 2709 "psubb %%mm2, %%mm1 \n\t" |
2710 "paddb %%mm7, %%mm1 \n\t" | |
2711 "pcmpgtb %%mm6, %%mm1 \n\t" | |
2712 "paddb %%mm1, %%mm0 \n\t" | |
2713 | |
2714 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2715 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2716 PMINUB(%%mm1, %%mm3, %%mm5) |
2979 | 2717 "psubb %%mm1, %%mm2 \n\t" |
2718 "paddb %%mm7, %%mm2 \n\t" | |
2719 "pcmpgtb %%mm6, %%mm2 \n\t" | |
2720 "paddb %%mm2, %%mm0 \n\t" | |
2721 | |
2722 "movq (%2, %3, 8), %%mm2 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2723 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2724 PMINUB(%%mm2, %%mm3, %%mm5) |
2979 | 2725 "psubb %%mm2, %%mm1 \n\t" |
2726 "paddb %%mm7, %%mm1 \n\t" | |
2727 "pcmpgtb %%mm6, %%mm1 \n\t" | |
2728 "paddb %%mm1, %%mm0 \n\t" | |
2729 | |
2730 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" | |
2731 "psubb %%mm1, %%mm2 \n\t" | |
2732 "paddb %%mm7, %%mm2 \n\t" | |
2733 "pcmpgtb %%mm6, %%mm2 \n\t" | |
2734 "paddb %%mm2, %%mm0 \n\t" | |
2735 "psubusb %%mm3, %%mm4 \n\t" | |
2736 | |
2737 "pxor %%mm6, %%mm6 \n\t" | |
2738 "movq %4, %%mm7 \n\t" // QP,..., QP | |
2739 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
2740 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 | |
2741 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 | |
2742 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF | |
2743 "movq %%mm7, %1 \n\t" | |
2744 | |
2745 "movq %5, %%mm7 \n\t" | |
2746 "punpcklbw %%mm7, %%mm7 \n\t" | |
2747 "punpcklbw %%mm7, %%mm7 \n\t" | |
2748 "punpcklbw %%mm7, %%mm7 \n\t" | |
2749 "psubb %%mm0, %%mm6 \n\t" | |
2750 "pcmpgtb %%mm7, %%mm6 \n\t" | |
2751 "movq %%mm6, %0 \n\t" | |
2752 | |
2753 : "=m" (eq_mask), "=m" (dc_mask) | |
2754 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) | |
2755 : "%"REG_a | |
2756 ); | |
2757 | |
2758 both_masks = dc_mask & eq_mask; | |
2759 | |
2760 if(both_masks){ | |
2761 long offset= -8*step; | |
2762 int64_t *temp_sums= sums; | |
2763 | |
2764 asm volatile( | |
2765 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2766 "pxor %%mm4, %%mm4 \n\t" | |
2767 | |
2768 "movq (%0), %%mm6 \n\t" | |
2769 "movq (%0, %1), %%mm5 \n\t" | |
2770 "movq %%mm5, %%mm1 \n\t" | |
2771 "movq %%mm6, %%mm2 \n\t" | |
2772 "psubusb %%mm6, %%mm5 \n\t" | |
2773 "psubusb %%mm1, %%mm2 \n\t" | |
2774 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2775 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2776 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2777 | |
2778 "pxor %%mm6, %%mm1 \n\t" | |
2779 "pand %%mm0, %%mm1 \n\t" | |
2780 "pxor %%mm1, %%mm6 \n\t" | |
2781 // 0:QP 6:First | |
2782 | |
2783 "movq (%0, %1, 8), %%mm5 \n\t" | |
2784 "add %1, %0 \n\t" // %0 points to line 1 not 0 | |
2785 "movq (%0, %1, 8), %%mm7 \n\t" | |
2786 "movq %%mm5, %%mm1 \n\t" | |
2787 "movq %%mm7, %%mm2 \n\t" | |
2788 "psubusb %%mm7, %%mm5 \n\t" | |
2789 "psubusb %%mm1, %%mm2 \n\t" | |
2790 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2791 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2792 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2793 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2794 | |
2795 "pxor %%mm7, %%mm1 \n\t" | |
2796 "pand %%mm0, %%mm1 \n\t" | |
2797 "pxor %%mm1, %%mm7 \n\t" | |
2798 | |
2799 "movq %%mm6, %%mm5 \n\t" | |
2800 "punpckhbw %%mm4, %%mm6 \n\t" | |
2801 "punpcklbw %%mm4, %%mm5 \n\t" | |
2802 // 4:0 5/6:First 7:Last | |
2803 | |
2804 "movq %%mm5, %%mm0 \n\t" | |
2805 "movq %%mm6, %%mm1 \n\t" | |
2806 "psllw $2, %%mm0 \n\t" | |
2807 "psllw $2, %%mm1 \n\t" | |
2808 "paddw "MANGLE(w04)", %%mm0 \n\t" | |
2809 "paddw "MANGLE(w04)", %%mm1 \n\t" | |
2040 | 2810 |
2811 #define NEXT\ | |
2979 | 2812 "movq (%0), %%mm2 \n\t"\ |
2813 "movq (%0), %%mm3 \n\t"\ | |
2814 "add %1, %0 \n\t"\ | |
2815 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
2816 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2817 "paddw %%mm2, %%mm0 \n\t"\ | |
2818 "paddw %%mm3, %%mm1 \n\t" | |
2040 | 2819 |
2820 #define PREV\ | |
2979 | 2821 "movq (%0), %%mm2 \n\t"\ |
2822 "movq (%0), %%mm3 \n\t"\ | |
2823 "add %1, %0 \n\t"\ | |
2824 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
2825 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2826 "psubw %%mm2, %%mm0 \n\t"\ | |
2827 "psubw %%mm3, %%mm1 \n\t" | |
2828 | |
2829 | |
2830 NEXT //0 | |
2831 NEXT //1 | |
2832 NEXT //2 | |
2833 "movq %%mm0, (%3) \n\t" | |
2834 "movq %%mm1, 8(%3) \n\t" | |
2835 | |
2836 NEXT //3 | |
2837 "psubw %%mm5, %%mm0 \n\t" | |
2838 "psubw %%mm6, %%mm1 \n\t" | |
2839 "movq %%mm0, 16(%3) \n\t" | |
2840 "movq %%mm1, 24(%3) \n\t" | |
2841 | |
2842 NEXT //4 | |
2843 "psubw %%mm5, %%mm0 \n\t" | |
2844 "psubw %%mm6, %%mm1 \n\t" | |
2845 "movq %%mm0, 32(%3) \n\t" | |
2846 "movq %%mm1, 40(%3) \n\t" | |
2847 | |
2848 NEXT //5 | |
2849 "psubw %%mm5, %%mm0 \n\t" | |
2850 "psubw %%mm6, %%mm1 \n\t" | |
2851 "movq %%mm0, 48(%3) \n\t" | |
2852 "movq %%mm1, 56(%3) \n\t" | |
2853 | |
2854 NEXT //6 | |
2855 "psubw %%mm5, %%mm0 \n\t" | |
2856 "psubw %%mm6, %%mm1 \n\t" | |
2857 "movq %%mm0, 64(%3) \n\t" | |
2858 "movq %%mm1, 72(%3) \n\t" | |
2859 | |
2860 "movq %%mm7, %%mm6 \n\t" | |
2861 "punpckhbw %%mm4, %%mm7 \n\t" | |
2862 "punpcklbw %%mm4, %%mm6 \n\t" | |
2863 | |
2864 NEXT //7 | |
2865 "mov %4, %0 \n\t" | |
2866 "add %1, %0 \n\t" | |
2867 PREV //0 | |
2868 "movq %%mm0, 80(%3) \n\t" | |
2869 "movq %%mm1, 88(%3) \n\t" | |
2870 | |
2871 PREV //1 | |
2872 "paddw %%mm6, %%mm0 \n\t" | |
2873 "paddw %%mm7, %%mm1 \n\t" | |
2874 "movq %%mm0, 96(%3) \n\t" | |
2875 "movq %%mm1, 104(%3) \n\t" | |
2876 | |
2877 PREV //2 | |
2878 "paddw %%mm6, %%mm0 \n\t" | |
2879 "paddw %%mm7, %%mm1 \n\t" | |
2880 "movq %%mm0, 112(%3) \n\t" | |
2881 "movq %%mm1, 120(%3) \n\t" | |
2882 | |
2883 PREV //3 | |
2884 "paddw %%mm6, %%mm0 \n\t" | |
2885 "paddw %%mm7, %%mm1 \n\t" | |
2886 "movq %%mm0, 128(%3) \n\t" | |
2887 "movq %%mm1, 136(%3) \n\t" | |
2888 | |
2889 PREV //4 | |
2890 "paddw %%mm6, %%mm0 \n\t" | |
2891 "paddw %%mm7, %%mm1 \n\t" | |
2892 "movq %%mm0, 144(%3) \n\t" | |
2893 "movq %%mm1, 152(%3) \n\t" | |
2894 | |
2895 "mov %4, %0 \n\t" //FIXME | |
2896 | |
2897 : "+&r"(src) | |
2898 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) | |
2899 ); | |
2900 | |
2901 src+= step; // src points to begin of the 8x8 Block | |
2902 | |
2903 asm volatile( | |
2904 "movq %4, %%mm6 \n\t" | |
2905 "pcmpeqb %%mm5, %%mm5 \n\t" | |
2906 "pxor %%mm6, %%mm5 \n\t" | |
2907 "pxor %%mm7, %%mm7 \n\t" | |
2908 | |
2909 "1: \n\t" | |
2910 "movq (%1), %%mm0 \n\t" | |
2911 "movq 8(%1), %%mm1 \n\t" | |
2912 "paddw 32(%1), %%mm0 \n\t" | |
2913 "paddw 40(%1), %%mm1 \n\t" | |
2914 "movq (%0, %3), %%mm2 \n\t" | |
2915 "movq %%mm2, %%mm3 \n\t" | |
2916 "movq %%mm2, %%mm4 \n\t" | |
2917 "punpcklbw %%mm7, %%mm2 \n\t" | |
2918 "punpckhbw %%mm7, %%mm3 \n\t" | |
2919 "paddw %%mm2, %%mm0 \n\t" | |
2920 "paddw %%mm3, %%mm1 \n\t" | |
2921 "paddw %%mm2, %%mm0 \n\t" | |
2922 "paddw %%mm3, %%mm1 \n\t" | |
2923 "psrlw $4, %%mm0 \n\t" | |
2924 "psrlw $4, %%mm1 \n\t" | |
2925 "packuswb %%mm1, %%mm0 \n\t" | |
2926 "pand %%mm6, %%mm0 \n\t" | |
2927 "pand %%mm5, %%mm4 \n\t" | |
2928 "por %%mm4, %%mm0 \n\t" | |
2929 "movq %%mm0, (%0, %3) \n\t" | |
2930 "add $16, %1 \n\t" | |
2931 "add %2, %0 \n\t" | |
2932 " js 1b \n\t" | |
2933 | |
2934 : "+r"(offset), "+r"(temp_sums) | |
2935 : "r" ((long)step), "r"(src - offset), "m"(both_masks) | |
2936 ); | |
2937 }else | |
2938 src+= step; // src points to begin of the 8x8 Block | |
2939 | |
2940 if(eq_mask != -1LL){ | |
2941 uint8_t *temp_src= src; | |
2942 asm volatile( | |
2943 "pxor %%mm7, %%mm7 \n\t" | |
2944 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars | |
2945 "and "ALIGN_MASK", %%"REG_c" \n\t" // align | |
2946 // 0 1 2 3 4 5 6 7 8 9 | |
2947 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 | |
2948 | |
2949 "movq (%0), %%mm0 \n\t" | |
2950 "movq %%mm0, %%mm1 \n\t" | |
2951 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | |
2952 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | |
2953 | |
2954 "movq (%0, %1), %%mm2 \n\t" | |
2955 "lea (%0, %1, 2), %%"REG_a" \n\t" | |
2956 "movq %%mm2, %%mm3 \n\t" | |
2957 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | |
2958 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | |
2959 | |
2960 "movq (%%"REG_a"), %%mm4 \n\t" | |
2961 "movq %%mm4, %%mm5 \n\t" | |
2962 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | |
2963 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | |
2964 | |
2965 "paddw %%mm0, %%mm0 \n\t" // 2L0 | |
2966 "paddw %%mm1, %%mm1 \n\t" // 2H0 | |
2967 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 | |
2968 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 | |
2969 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 | |
2970 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 | |
2971 | |
2972 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | |
2973 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | |
2974 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | |
2975 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | |
2976 | |
2977 "movq (%%"REG_a", %1), %%mm2 \n\t" | |
2978 "movq %%mm2, %%mm3 \n\t" | |
2979 "punpcklbw %%mm7, %%mm2 \n\t" // L3 | |
2980 "punpckhbw %%mm7, %%mm3 \n\t" // H3 | |
2981 | |
2982 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | |
2983 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | |
2984 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
2985 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
2986 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
2987 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
2988 | |
2989 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" | |
2990 "movq %%mm0, %%mm1 \n\t" | |
2991 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | |
2992 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | |
2993 | |
2994 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | |
2995 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | |
2996 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 | |
2997 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 | |
2998 "paddw %%mm4, %%mm4 \n\t" // 2L2 | |
2999 "paddw %%mm5, %%mm5 \n\t" // 2H2 | |
3000 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | |
3001 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | |
3002 | |
3003 "lea (%%"REG_a", %1), %0 \n\t" | |
3004 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | |
3005 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | |
3006 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | |
3007 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3008 //50 opcodes so far |
2979 | 3009 "movq (%0, %1, 2), %%mm2 \n\t" |
3010 "movq %%mm2, %%mm3 \n\t" | |
3011 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | |
3012 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | |
3013 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | |
3014 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | |
3015 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | |
3016 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | |
3017 | |
3018 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" | |
3019 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | |
3020 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | |
3021 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" | |
3022 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | |
3023 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | |
3024 | |
3025 "paddw %%mm0, %%mm0 \n\t" // 2L4 | |
3026 "paddw %%mm1, %%mm1 \n\t" // 2H4 | |
3027 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 | |
3028 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 | |
3029 | |
3030 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | |
3031 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | |
3032 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | |
3033 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | |
3034 | |
3035 "movq (%0, %1, 4), %%mm2 \n\t" | |
3036 "movq %%mm2, %%mm3 \n\t" | |
3037 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | |
3038 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | |
3039 | |
3040 "paddw %%mm2, %%mm2 \n\t" // 2L7 | |
3041 "paddw %%mm3, %%mm3 \n\t" // 2H7 | |
3042 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | |
3043 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | |
3044 | |
3045 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
3046 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3047 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3048 #ifdef HAVE_MMX2 |
2979 | 3049 "movq %%mm7, %%mm6 \n\t" // 0 |
3050 "psubw %%mm0, %%mm6 \n\t" | |
3051 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
3052 "movq %%mm7, %%mm6 \n\t" // 0 | |
3053 "psubw %%mm1, %%mm6 \n\t" | |
3054 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
3055 "movq %%mm7, %%mm6 \n\t" // 0 | |
3056 "psubw %%mm2, %%mm6 \n\t" | |
3057 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
3058 "movq %%mm7, %%mm6 \n\t" // 0 | |
3059 "psubw %%mm3, %%mm6 \n\t" | |
3060 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3061 #else |
2979 | 3062 "movq %%mm7, %%mm6 \n\t" // 0 |
3063 "pcmpgtw %%mm0, %%mm6 \n\t" | |
3064 "pxor %%mm6, %%mm0 \n\t" | |
3065 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
3066 "movq %%mm7, %%mm6 \n\t" // 0 | |
3067 "pcmpgtw %%mm1, %%mm6 \n\t" | |
3068 "pxor %%mm6, %%mm1 \n\t" | |
3069 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
3070 "movq %%mm7, %%mm6 \n\t" // 0 | |
3071 "pcmpgtw %%mm2, %%mm6 \n\t" | |
3072 "pxor %%mm6, %%mm2 \n\t" | |
3073 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
3074 "movq %%mm7, %%mm6 \n\t" // 0 | |
3075 "pcmpgtw %%mm3, %%mm6 \n\t" | |
3076 "pxor %%mm6, %%mm3 \n\t" | |
3077 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3078 #endif |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3079 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3080 #ifdef HAVE_MMX2 |
2979 | 3081 "pminsw %%mm2, %%mm0 \n\t" |
3082 "pminsw %%mm3, %%mm1 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3083 #else |
2979 | 3084 "movq %%mm0, %%mm6 \n\t" |
3085 "psubusw %%mm2, %%mm6 \n\t" | |
3086 "psubw %%mm6, %%mm0 \n\t" | |
3087 "movq %%mm1, %%mm6 \n\t" | |
3088 "psubusw %%mm3, %%mm6 \n\t" | |
3089 "psubw %%mm6, %%mm1 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3090 #endif |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3091 |
2979 | 3092 "movd %2, %%mm2 \n\t" // QP |
3093 "punpcklbw %%mm7, %%mm2 \n\t" | |
3094 | |
3095 "movq %%mm7, %%mm6 \n\t" // 0 | |
3096 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | |
3097 "pxor %%mm6, %%mm4 \n\t" | |
3098 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | |
3099 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | |
3100 "pxor %%mm7, %%mm5 \n\t" | |
3101 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3102 // 100 opcodes |
2979 | 3103 "psllw $3, %%mm2 \n\t" // 8QP |
3104 "movq %%mm2, %%mm3 \n\t" // 8QP | |
3105 "pcmpgtw %%mm4, %%mm2 \n\t" | |
3106 "pcmpgtw %%mm5, %%mm3 \n\t" | |
3107 "pand %%mm2, %%mm4 \n\t" | |
3108 "pand %%mm3, %%mm5 \n\t" | |
3109 | |
3110 | |
3111 "psubusw %%mm0, %%mm4 \n\t" // hd | |
3112 "psubusw %%mm1, %%mm5 \n\t" // ld | |
3113 | |
3114 | |
3115 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 | |
3116 "pmullw %%mm2, %%mm4 \n\t" | |
3117 "pmullw %%mm2, %%mm5 \n\t" | |
3118 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 | |
3119 "paddw %%mm2, %%mm4 \n\t" | |
3120 "paddw %%mm2, %%mm5 \n\t" | |
3121 "psrlw $6, %%mm4 \n\t" | |
3122 "psrlw $6, %%mm5 \n\t" | |
3123 | |
3124 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 | |
3125 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 | |
3126 | |
3127 "pxor %%mm2, %%mm2 \n\t" | |
3128 "pxor %%mm3, %%mm3 \n\t" | |
3129 | |
3130 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | |
3131 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) | |
3132 "pxor %%mm2, %%mm0 \n\t" | |
3133 "pxor %%mm3, %%mm1 \n\t" | |
3134 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| | |
3135 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| | |
3136 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 | |
3137 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 | |
3138 | |
3139 "pxor %%mm6, %%mm2 \n\t" | |
3140 "pxor %%mm7, %%mm3 \n\t" | |
3141 "pand %%mm2, %%mm4 \n\t" | |
3142 "pand %%mm3, %%mm5 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3143 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3144 #ifdef HAVE_MMX2 |
2979 | 3145 "pminsw %%mm0, %%mm4 \n\t" |
3146 "pminsw %%mm1, %%mm5 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3147 #else |
2979 | 3148 "movq %%mm4, %%mm2 \n\t" |
3149 "psubusw %%mm0, %%mm2 \n\t" | |
3150 "psubw %%mm2, %%mm4 \n\t" | |
3151 "movq %%mm5, %%mm2 \n\t" | |
3152 "psubusw %%mm1, %%mm2 \n\t" | |
3153 "psubw %%mm2, %%mm5 \n\t" | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3154 #endif |
2979 | 3155 "pxor %%mm6, %%mm4 \n\t" |
3156 "pxor %%mm7, %%mm5 \n\t" | |
3157 "psubw %%mm6, %%mm4 \n\t" | |
3158 "psubw %%mm7, %%mm5 \n\t" | |
3159 "packsswb %%mm5, %%mm4 \n\t" | |
3160 "movq %3, %%mm1 \n\t" | |
3161 "pandn %%mm4, %%mm1 \n\t" | |
3162 "movq (%0), %%mm0 \n\t" | |
3163 "paddb %%mm1, %%mm0 \n\t" | |
3164 "movq %%mm0, (%0) \n\t" | |
3165 "movq (%0, %1), %%mm0 \n\t" | |
3166 "psubb %%mm1, %%mm0 \n\t" | |
3167 "movq %%mm0, (%0, %1) \n\t" | |
3168 | |
3169 : "+r" (temp_src) | |
3170 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) | |
3171 : "%"REG_a, "%"REG_c | |
3172 ); | |
3173 } | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3174 /*if(step==16){ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3175 STOP_TIMER("step16") |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3176 }else{ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3177 STOP_TIMER("stepX") |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3178 }*/ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3179 } |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3180 #endif //HAVE_MMX |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3181 |
169 | 3182 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
2979 | 3183 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
96 | 3184 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3185 /** |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3186 * Copies a block from src to dst and fixes the blacklevel |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3187 * levelFix == 0 -> don't touch the brightness & contrast |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3188 */ |
634
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3189 #undef SCALED_CPY |
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3190 |
169 | 3191 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
2979 | 3192 int levelFix, int64_t *packedOffsetAndScale) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3193 { |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3194 #ifndef HAVE_MMX |
2979 | 3195 int i; |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3196 #endif |
2979 | 3197 if(levelFix) |
3198 { | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3199 #ifdef HAVE_MMX |
2979 | 3200 asm volatile( |
3201 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset | |
3202 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale | |
3203 "lea (%2,%4), %%"REG_a" \n\t" | |
3204 "lea (%3,%5), %%"REG_d" \n\t" | |
3205 "pxor %%mm4, %%mm4 \n\t" | |
173 | 3206 #ifdef HAVE_MMX2 |
2979 | 3207 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
3208 "movq " #src1 ", %%mm0 \n\t"\ | |
3209 "movq " #src1 ", %%mm5 \n\t"\ | |
3210 "movq " #src2 ", %%mm1 \n\t"\ | |
3211 "movq " #src2 ", %%mm6 \n\t"\ | |
3212 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
3213 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
3214 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
3215 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
3216 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
3217 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
3218 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
3219 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
3220 "psubw %%mm2, %%mm0 \n\t"\ | |
3221 "psubw %%mm2, %%mm5 \n\t"\ | |
3222 "psubw %%mm2, %%mm1 \n\t"\ | |
3223 "psubw %%mm2, %%mm6 \n\t"\ | |
3224 "packuswb %%mm5, %%mm0 \n\t"\ | |
3225 "packuswb %%mm6, %%mm1 \n\t"\ | |
3226 "movq %%mm0, " #dst1 " \n\t"\ | |
3227 "movq %%mm1, " #dst2 " \n\t"\ | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3228 |
173 | 3229 #else //HAVE_MMX2 |
2979 | 3230 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
3231 "movq " #src1 ", %%mm0 \n\t"\ | |
3232 "movq " #src1 ", %%mm5 \n\t"\ | |
3233 "punpcklbw %%mm4, %%mm0 \n\t"\ | |
3234 "punpckhbw %%mm4, %%mm5 \n\t"\ | |
3235 "psubw %%mm2, %%mm0 \n\t"\ | |
3236 "psubw %%mm2, %%mm5 \n\t"\ | |
3237 "movq " #src2 ", %%mm1 \n\t"\ | |
3238 "psllw $6, %%mm0 \n\t"\ | |
3239 "psllw $6, %%mm5 \n\t"\ | |
3240 "pmulhw %%mm3, %%mm0 \n\t"\ | |
3241 "movq " #src2 ", %%mm6 \n\t"\ | |
3242 "pmulhw %%mm3, %%mm5 \n\t"\ | |
3243 "punpcklbw %%mm4, %%mm1 \n\t"\ | |
3244 "punpckhbw %%mm4, %%mm6 \n\t"\ | |
3245 "psubw %%mm2, %%mm1 \n\t"\ | |
3246 "psubw %%mm2, %%mm6 \n\t"\ | |
3247 "psllw $6, %%mm1 \n\t"\ | |
3248 "psllw $6, %%mm6 \n\t"\ | |
3249 "pmulhw %%mm3, %%mm1 \n\t"\ | |
3250 "pmulhw %%mm3, %%mm6 \n\t"\ | |
3251 "packuswb %%mm5, %%mm0 \n\t"\ | |
3252 "packuswb %%mm6, %%mm1 \n\t"\ | |
3253 "movq %%mm0, " #dst1 " \n\t"\ | |
3254 "movq %%mm1, " #dst2 " \n\t"\ | |
166 | 3255 |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3256 #endif //HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3257 #define SCALED_CPY(src1, src2, dst1, dst2)\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3258 REAL_SCALED_CPY(src1, src2, dst1, dst2) |
173 | 3259 |
787 | 3260 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3261 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3262 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) |
2979 | 3263 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" |
3264 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3265 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) |
166 | 3266 |
3267 | |
2979 | 3268 : "=&a" (packedOffsetAndScale) |
3269 : "0" (packedOffsetAndScale), | |
3270 "r"(src), | |
3271 "r"(dst), | |
3272 "r" ((long)srcStride), | |
3273 "r" ((long)dstStride) | |
3274 : "%"REG_d | |
3275 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3276 #else //HAVE_MMX |
2979 | 3277 for(i=0; i<8; i++) |
3278 memcpy( &(dst[dstStride*i]), | |
3279 &(src[srcStride*i]), BLOCK_SIZE); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3280 #endif //HAVE_MMX |
2979 | 3281 } |
3282 else | |
3283 { | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3284 #ifdef HAVE_MMX |
2979 | 3285 asm volatile( |
3286 "lea (%0,%2), %%"REG_a" \n\t" | |
3287 "lea (%1,%3), %%"REG_d" \n\t" | |
3288 | |
3289 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
3290 "movq " #src1 ", %%mm0 \n\t"\ | |
3291 "movq " #src2 ", %%mm1 \n\t"\ | |
3292 "movq %%mm0, " #dst1 " \n\t"\ | |
3293 "movq %%mm1, " #dst2 " \n\t"\ | |
166 | 3294 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3295 #define SIMPLE_CPY(src1, src2, dst1, dst2)\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3296 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3297 |
2979 | 3298 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3299 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3300 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) |
2979 | 3301 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" |
3302 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3303 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) |
166 | 3304 |
2979 | 3305 : : "r" (src), |
3306 "r" (dst), | |
3307 "r" ((long)srcStride), | |
3308 "r" ((long)dstStride) | |
3309 : "%"REG_a, "%"REG_d | |
3310 ); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3311 #else //HAVE_MMX |
2979 | 3312 for(i=0; i<8; i++) |
3313 memcpy( &(dst[dstStride*i]), | |
3314 &(src[srcStride*i]), BLOCK_SIZE); | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3315 #endif //HAVE_MMX |
2979 | 3316 } |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3317 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3318 |
224 | 3319 /** |
3320 * Duplicates the given 8 src pixels 3 times upward | |
3321 */ | |
3322 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
3323 { | |
3324 #ifdef HAVE_MMX | |
2979 | 3325 asm volatile( |
3326 "movq (%0), %%mm0 \n\t" | |
3327 "add %1, %0 \n\t" | |
3328 "movq %%mm0, (%0) \n\t" | |
3329 "movq %%mm0, (%0, %1) \n\t" | |
3330 "movq %%mm0, (%0, %1, 2) \n\t" | |
3331 : "+r" (src) | |
3332 : "r" ((long)-stride) | |
3333 ); | |
224 | 3334 #else |
2979 | 3335 int i; |
3336 uint8_t *p=src; | |
3337 for(i=0; i<3; i++) | |
3338 { | |
3339 p-= stride; | |
3340 memcpy(p, src, 8); | |
3341 } | |
224 | 3342 #endif |
3343 } | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3344 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3345 /** |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3346 * Filters array of bytes (Y or U or V values) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3347 */ |
169 | 3348 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
2979 | 3349 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3350 { |
2979 | 3351 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
3352 int x,y; | |
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3353 #ifdef COMPILE_TIME_MODE |
2979 | 3354 const int mode= COMPILE_TIME_MODE; |
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3355 #else |
2979 | 3356 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3357 #endif |
2979 | 3358 int black=0, white=255; // blackest black and whitest white in the picture |
3359 int QPCorrecture= 256*256; | |
3360 | |
3361 int copyAhead; | |
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3362 #ifdef HAVE_MMX |
2979 | 3363 int i; |
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3364 #endif |
164 | 3365 |
2979 | 3366 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
3367 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | |
3368 | |
3369 //FIXME remove | |
3370 uint64_t * const yHistogram= c.yHistogram; | |
3371 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; | |
3372 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; | |
3373 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; | |
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
3374 |
158 | 3375 #ifdef HAVE_MMX |
2979 | 3376 for(i=0; i<57; i++){ |
3377 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; | |
3378 int threshold= offset*2 + 1; | |
3379 c.mmxDcOffset[i]= 0x7F - offset; | |
3380 c.mmxDcThreshold[i]= 0x7F - threshold; | |
3381 c.mmxDcOffset[i]*= 0x0101010101010101LL; | |
3382 c.mmxDcThreshold[i]*= 0x0101010101010101LL; | |
3383 } | |
158 | 3384 #endif |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3385 |
2979 | 3386 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
3387 else if( (mode & LINEAR_BLEND_DEINT_FILTER) | |
3388 || (mode & FFMPEG_DEINT_FILTER) | |
3389 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; | |
3390 else if( (mode & V_DEBLOCK) | |
3391 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
3392 || (mode & MEDIAN_DEINT_FILTER) | |
3393 || (mode & V_A_DEBLOCK)) copyAhead=13; | |
3394 else if(mode & V_X1_FILTER) copyAhead=11; | |
3395 // else if(mode & V_RK1_FILTER) copyAhead=10; | |
3396 else if(mode & DERING) copyAhead=9; | |
3397 else copyAhead=8; | |
3398 | |
3399 copyAhead-= 8; | |
3400 | |
3401 if(!isColor) | |
3402 { | |
3403 uint64_t sum= 0; | |
3404 int i; | |
3405 uint64_t maxClipped; | |
3406 uint64_t clipped; | |
3407 double scale; | |
3408 | |
3409 c.frameNum++; | |
3410 // first frame is fscked so we ignore it | |
3411 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
3412 | |
3413 for(i=0; i<256; i++) | |
3414 { | |
3415 sum+= yHistogram[i]; | |
3416 // printf("%d ", yHistogram[i]); | |
3417 } | |
3418 // printf("\n\n"); | |
3419 | |
3420 /* we always get a completely black picture first */ | |
3421 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); | |
3422 | |
3423 clipped= sum; | |
3424 for(black=255; black>0; black--) | |
3425 { | |
3426 if(clipped < maxClipped) break; | |
3427 clipped-= yHistogram[black]; | |
3428 } | |
3429 | |
3430 clipped= sum; | |
3431 for(white=0; white<256; white++) | |
3432 { | |
3433 if(clipped < maxClipped) break; | |
3434 clipped-= yHistogram[white]; | |
3435 } | |
3436 | |
3437 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); | |
173 | 3438 |
3439 #ifdef HAVE_MMX2 | |
2979 | 3440 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
3441 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
173 | 3442 #else |
2979 | 3443 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
3444 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
173 | 3445 #endif |
3446 | |
2979 | 3447 c.packedYOffset|= c.packedYOffset<<32; |
3448 c.packedYOffset|= c.packedYOffset<<16; | |
3449 | |
3450 c.packedYScale|= c.packedYScale<<32; | |
3451 c.packedYScale|= c.packedYScale<<16; | |
3452 | |
3453 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
3454 else QPCorrecture= 256*256; | |
3455 } | |
3456 else | |
3457 { | |
3458 c.packedYScale= 0x0100010001000100LL; | |
3459 c.packedYOffset= 0; | |
3460 QPCorrecture= 256*256; | |
3461 } | |
3462 | |
3463 /* copy & deinterlace first row of blocks */ | |
3464 y=-BLOCK_SIZE; | |
3465 { | |
3466 uint8_t *srcBlock= &(src[y*srcStride]); | |
3467 uint8_t *dstBlock= tempDst + dstStride; | |
3468 | |
3469 // From this point on it is guaranteed that we can read and write 16 lines downward | |
3470 // finish 1 block before the next otherwise we might have a problem | |
3471 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
3472 for(x=0; x<width; x+=BLOCK_SIZE) | |
3473 { | |
142 | 3474 |
3475 #ifdef HAVE_MMX2 | |
3476 /* | |
2979 | 3477 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
3478 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
3479 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
3480 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
142 | 3481 */ |
3482 | |
2979 | 3483 asm( |
3484 "mov %4, %%"REG_a" \n\t" | |
3485 "shr $2, %%"REG_a" \n\t" | |
3486 "and $6, %%"REG_a" \n\t" | |
3487 "add %5, %%"REG_a" \n\t" | |
3488 "mov %%"REG_a", %%"REG_d" \n\t" | |
3489 "imul %1, %%"REG_a" \n\t" | |
3490 "imul %3, %%"REG_d" \n\t" | |
3491 "prefetchnta 32(%%"REG_a", %0) \n\t" | |
3492 "prefetcht0 32(%%"REG_d", %2) \n\t" | |
3493 "add %1, %%"REG_a" \n\t" | |
3494 "add %3, %%"REG_d" \n\t" | |
3495 "prefetchnta 32(%%"REG_a", %0) \n\t" | |
3496 "prefetcht0 32(%%"REG_d", %2) \n\t" | |
3497 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), | |
3498 "g" ((long)x), "g" ((long)copyAhead) | |
3499 : "%"REG_a, "%"REG_d | |
3500 ); | |
142 | 3501 |
3502 #elif defined(HAVE_3DNOW) | |
3503 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
2979 | 3504 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
3505 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3506 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3507 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
142 | 3508 */ |
3509 #endif | |
3510 | |
2979 | 3511 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
3512 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); | |
3513 | |
3514 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
3515 | |
3516 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
3517 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | |
3518 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
3519 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); | |
3520 else if(mode & MEDIAN_DEINT_FILTER) | |
3521 RENAME(deInterlaceMedian)(dstBlock, dstStride); | |
3522 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
3523 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | |
3524 else if(mode & FFMPEG_DEINT_FILTER) | |
3525 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
3526 else if(mode & LOWPASS5_DEINT_FILTER) | |
3527 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
3528 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
3529 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | |
142 | 3530 */ |
2979 | 3531 dstBlock+=8; |
3532 srcBlock+=8; | |
3533 } | |
3534 if(width==ABS(dstStride)) | |
3535 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); | |
3536 else | |
3537 { | |
3538 int i; | |
3539 for(i=0; i<copyAhead; i++) | |
3540 { | |
3541 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
3542 } | |
3543 } | |
3544 } | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3545 |
787 | 3546 //printf("\n"); |
2979 | 3547 for(y=0; y<height; y+=BLOCK_SIZE) |
3548 { | |
3549 //1% speedup if these are here instead of the inner loop | |
3550 uint8_t *srcBlock= &(src[y*srcStride]); | |
3551 uint8_t *dstBlock= &(dst[y*dstStride]); | |
169 | 3552 #ifdef HAVE_MMX |
2979 | 3553 uint8_t *tempBlock1= c.tempBlocks; |
3554 uint8_t *tempBlock2= c.tempBlocks + 8; | |
169 | 3555 #endif |
2979 | 3556 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
3557 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)]; | |
3558 int QP=0; | |
3559 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | |
3560 if not then use a temporary buffer */ | |
3561 if(y+15 >= height) | |
3562 { | |
3563 int i; | |
3564 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with | |
3565 blockcopy to dst later */ | |
3566 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, | |
3567 MAX(height-y-copyAhead, 0), srcStride); | |
3568 | |
3569 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | |
3570 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
3571 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride)); | |
3572 | |
3573 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ | |
3574 linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride); | |
3575 | |
3576 /* duplicate last line of dst to fill the void up to line (copyAhead) */ | |
3577 for(i=height-y+1; i<=copyAhead; i++) | |
3578 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride)); | |
3579 | |
3580 dstBlock= tempDst + dstStride; | |
3581 srcBlock= tempSrc; | |
3582 } | |
787 | 3583 //printf("\n"); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3584 |
2979 | 3585 // From this point on it is guaranteed that we can read and write 16 lines downward |
3586 // finish 1 block before the next otherwise we might have a problem | |
3587 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
3588 for(x=0; x<width; x+=BLOCK_SIZE) | |
3589 { | |
3590 const int stride= dstStride; | |
169 | 3591 #ifdef HAVE_MMX |
2979 | 3592 uint8_t *tmpXchg; |
169 | 3593 #endif |
2979 | 3594 if(isColor) |
3595 { | |
3596 QP= QPptr[x>>qpHShift]; | |
3597 c.nonBQP= nonBQPptr[x>>qpHShift]; | |
3598 } | |
3599 else | |
3600 { | |
3601 QP= QPptr[x>>4]; | |
3602 QP= (QP* QPCorrecture + 256*128)>>16; | |
3603 c.nonBQP= nonBQPptr[x>>4]; | |
3604 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | |
3605 yHistogram[ srcBlock[srcStride*12 + 4] ]++; | |
3606 } | |
3607 c.QP= QP; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3608 #ifdef HAVE_MMX |
2979 | 3609 asm volatile( |
3610 "movd %1, %%mm7 \n\t" | |
3611 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | |
3612 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
3613 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
3614 "movq %%mm7, %0 \n\t" | |
3615 : "=m" (c.pQPb) | |
3616 : "r" (QP) | |
3617 ); | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3618 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3619 |
96 | 3620 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3621 #ifdef HAVE_MMX2 |
126 | 3622 /* |
2979 | 3623 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
3624 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
3625 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
3626 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
126 | 3627 */ |
3628 | |
2979 | 3629 asm( |
3630 "mov %4, %%"REG_a" \n\t" | |
3631 "shr $2, %%"REG_a" \n\t" | |
3632 "and $6, %%"REG_a" \n\t" | |
3633 "add %5, %%"REG_a" \n\t" | |
3634 "mov %%"REG_a", %%"REG_d" \n\t" | |
3635 "imul %1, %%"REG_a" \n\t" | |
3636 "imul %3, %%"REG_d" \n\t" | |
3637 "prefetchnta 32(%%"REG_a", %0) \n\t" | |
3638 "prefetcht0 32(%%"REG_d", %2) \n\t" | |
3639 "add %1, %%"REG_a" \n\t" | |
3640 "add %3, %%"REG_d" \n\t" | |
3641 "prefetchnta 32(%%"REG_a", %0) \n\t" | |
3642 "prefetcht0 32(%%"REG_d", %2) \n\t" | |
3643 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), | |
3644 "g" ((long)x), "g" ((long)copyAhead) | |
3645 : "%"REG_a, "%"REG_d | |
3646 ); | |
126 | 3647 |
96 | 3648 #elif defined(HAVE_3DNOW) |
3649 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
2979 | 3650 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
3651 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3652 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3653 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
96 | 3654 */ |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3655 #endif |
111 | 3656 |
2979 | 3657 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
3658 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); | |
3659 | |
3660 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
3661 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | |
3662 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
3663 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); | |
3664 else if(mode & MEDIAN_DEINT_FILTER) | |
3665 RENAME(deInterlaceMedian)(dstBlock, dstStride); | |
3666 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
3667 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | |
3668 else if(mode & FFMPEG_DEINT_FILTER) | |
3669 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
3670 else if(mode & LOWPASS5_DEINT_FILTER) | |
3671 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
3672 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
3673 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
3674 */ |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3675 |
2979 | 3676 /* only deblock if we have 2 blocks */ |
3677 if(y + 8 < height) | |
3678 { | |
3679 if(mode & V_X1_FILTER) | |
3680 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
3681 else if(mode & V_DEBLOCK) | |
3682 { | |
3683 const int t= RENAME(vertClassify)(dstBlock, stride, &c); | |
3684 | |
3685 if(t==1) | |
3686 RENAME(doVertLowPass)(dstBlock, stride, &c); | |
3687 else if(t==2) | |
3688 RENAME(doVertDefFilter)(dstBlock, stride, &c); | |
3689 }else if(mode & V_A_DEBLOCK){ | |
3690 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); | |
3691 } | |
3692 } | |
130 | 3693 |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3694 #ifdef HAVE_MMX |
2979 | 3695 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3696 #endif |
2979 | 3697 /* check if we have a previous block to deblock it with dstBlock */ |
3698 if(x - 8 >= 0) | |
3699 { | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3700 #ifdef HAVE_MMX |
2979 | 3701 if(mode & H_X1_FILTER) |
3702 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
3703 else if(mode & H_DEBLOCK) | |
3704 { | |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3705 //START_TIMER |
2979 | 3706 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3707 //STOP_TIMER("dc & minmax") |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3708 if(t==1) |
2979 | 3709 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
3710 else if(t==2) | |
3711 RENAME(doVertDefFilter)(tempBlock1, 16, &c); | |
3712 }else if(mode & H_A_DEBLOCK){ | |
3713 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); | |
3714 } | |
3715 | |
3716 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3717 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3718 #else |
2979 | 3719 if(mode & H_X1_FILTER) |
3720 horizX1Filter(dstBlock-4, stride, QP); | |
3721 else if(mode & H_DEBLOCK) | |
3722 { | |
2043 | 3723 #ifdef HAVE_ALTIVEC |
2979 | 3724 unsigned char __attribute__ ((aligned(16))) tempBlock[272]; |
3725 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); | |
3726 | |
3727 const int t=vertClassify_altivec(tempBlock-48, 16, &c); | |
3728 if(t==1) { | |
3729 doVertLowPass_altivec(tempBlock-48, 16, &c); | |
2043 | 3730 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); |
3731 } | |
2979 | 3732 else if(t==2) { |
3733 doVertDefFilter_altivec(tempBlock-48, 16, &c); | |
2043 | 3734 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); |
3735 } | |
3736 #else | |
2979 | 3737 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); |
3738 | |
3739 if(t==1) | |
3740 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); | |
3741 else if(t==2) | |
3742 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); | |
2043 | 3743 #endif |
2979 | 3744 }else if(mode & H_A_DEBLOCK){ |
3745 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); | |
3746 } | |
2978
403183bbb505
Add some comments to #ifdef #else #endif blocks and fix wrong ones.
diego
parents:
2967
diff
changeset
|
3747 #endif //HAVE_MMX |
2979 | 3748 if(mode & DERING) |
3749 { | |
3750 //FIXME filter first line | |
3751 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); | |
3752 } | |
3753 | |
3754 if(mode & TEMP_NOISE_FILTER) | |
3755 { | |
3756 RENAME(tempNoiseReducer)(dstBlock-8, stride, | |
3757 c.tempBlured[isColor] + y*dstStride + x, | |
3758 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3759 c.ppMode.maxTmpNoise); | |
3760 } | |
3761 } | |
3762 | |
3763 dstBlock+=8; | |
3764 srcBlock+=8; | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3765 |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3766 #ifdef HAVE_MMX |
2979 | 3767 tmpXchg= tempBlock1; |
3768 tempBlock1= tempBlock2; | |
3769 tempBlock2 = tmpXchg; | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3770 #endif |
2979 | 3771 } |
3772 | |
3773 if(mode & DERING) | |
3774 { | |
3775 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); | |
3776 } | |
3777 | |
3778 if((mode & TEMP_NOISE_FILTER)) | |
3779 { | |
3780 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, | |
3781 c.tempBlured[isColor] + y*dstStride + x, | |
3782 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3783 c.ppMode.maxTmpNoise); | |
3784 } | |
3785 | |
3786 /* did we use a tmp buffer for the last lines*/ | |
3787 if(y+15 >= height) | |
3788 { | |
3789 uint8_t *dstBlock= &(dst[y*dstStride]); | |
3790 if(width==ABS(dstStride)) | |
3791 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); | |
3792 else | |
3793 { | |
3794 int i; | |
3795 for(i=0; i<height-y; i++) | |
3796 { | |
3797 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
3798 } | |
3799 } | |
3800 } | |
163 | 3801 /* |
2979 | 3802 for(x=0; x<width; x+=32) |
3803 { | |
3804 volatile int i; | |
3805 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | |
3806 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
3807 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; | |
3808 // + dstBlock[x +13*dstStride] | |
3809 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
3810 }*/ | |
3811 } | |
96 | 3812 #ifdef HAVE_3DNOW |
2979 | 3813 asm volatile("femms"); |
96 | 3814 #elif defined (HAVE_MMX) |
2979 | 3815 asm volatile("emms"); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3816 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3817 |
163 | 3818 #ifdef DEBUG_BRIGHTNESS |
2979 | 3819 if(!isColor) |
3820 { | |
3821 int max=1; | |
3822 int i; | |
3823 for(i=0; i<256; i++) | |
3824 if(yHistogram[i] > max) max=yHistogram[i]; | |
3825 | |
3826 for(i=1; i<256; i++) | |
3827 { | |
3828 int x; | |
3829 int start=yHistogram[i-1]/(max/256+1); | |
3830 int end=yHistogram[i]/(max/256+1); | |
3831 int inc= end > start ? 1 : -1; | |
3832 for(x=start; x!=end+inc; x+=inc) | |
3833 dst[ i*dstStride + x]+=128; | |
3834 } | |
3835 | |
3836 for(i=0; i<100; i+=2) | |
3837 { | |
3838 dst[ (white)*dstStride + i]+=128; | |
3839 dst[ (black)*dstStride + i]+=128; | |
3840 } | |
3841 | |
3842 } | |
163 | 3843 #endif |
3844 | |
2979 | 3845 *c2= c; //copy local context back |
787 | 3846 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3847 } |