Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 2601:c31a28f27d9a libavcodec
increasing precision of the quantization parameter
this is needed as the quantization stepsize for each subband is also in this precision, and insignificant changes to the wavelet like scaling its coefficients slightly differently would lead to wildly varying PSNR and bitrate
note, an encoder could also simply choose to leave the least significant bits of the quantization parameters zero, which would give the exact previous behaviour except for a very tiny number of bits in the header
author | michael |
---|---|
date | Sat, 09 Apr 2005 22:15:48 +0000 |
parents | ace6e273f318 |
children | 240e17c3cb2d |
rev | line source |
---|---|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
1109 | 19 /** |
20 * @file postprocess_template.c | |
21 * mmx/mmx2/3dnow postprocess code. | |
22 */ | |
23 | |
24 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
25 #ifdef ARCH_X86_64 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
26 # define REGa rax |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
27 # define REGc rcx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
28 # define REGd rdx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
29 # define REG_a "rax" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
30 # define REG_c "rcx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
31 # define REG_d "rdx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
32 # define REG_SP "rsp" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
33 # define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
34 #else |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
35 # define REGa eax |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
36 # define REGc ecx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
37 # define REGd edx |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
38 # define REG_a "eax" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
39 # define REG_c "ecx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
40 # define REG_d "edx" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
41 # define REG_SP "esp" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
42 # define ALIGN_MASK "$0xFFFFFFF8" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
43 #endif |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
44 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
45 |
169 | 46 #undef PAVGB |
47 #undef PMINUB | |
48 #undef PMAXUB | |
104 | 49 |
50 #ifdef HAVE_MMX2 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
51 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
104 | 52 #elif defined (HAVE_3DNOW) |
2295 | 53 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
104 | 54 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
55 #define PAVGB(a,b) REAL_PAVGB(a,b) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
56 |
134 | 57 #ifdef HAVE_MMX2 |
58 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
59 #elif defined (HAVE_MMX) | |
60 #define PMINUB(b,a,t) \ | |
61 "movq " #a ", " #t " \n\t"\ | |
62 "psubusb " #b ", " #t " \n\t"\ | |
63 "psubb " #t ", " #a " \n\t" | |
64 #endif | |
65 | |
66 #ifdef HAVE_MMX2 | |
67 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
68 #elif defined (HAVE_MMX) | |
69 #define PMAXUB(a,b) \ | |
70 "psubusb " #a ", " #b " \n\t"\ | |
71 "paddb " #a ", " #b " \n\t" | |
72 #endif | |
73 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
74 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
787 | 75 #ifdef HAVE_MMX |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
76 /** |
111 | 77 * Check if the middle 8x8 Block in the given 8x16 block is flat |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 */ |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
79 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
80 int numEq= 0, dcOk; |
111 | 81 src+= stride*4; // src points to begin of the 8x8 Block |
119 | 82 asm volatile( |
1331 | 83 "movq %0, %%mm7 \n\t" |
84 "movq %1, %%mm6 \n\t" | |
85 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
86 ); | |
87 | |
88 asm volatile( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
89 "lea (%2, %3), %%"REG_a" \n\t" |
119 | 90 // 0 1 2 3 4 5 6 7 8 9 |
787 | 91 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
791 | 92 |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
93 "movq (%2), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
94 "movq (%%"REG_a"), %%mm1 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
95 "movq %%mm0, %%mm3 \n\t" |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
96 "movq %%mm0, %%mm4 \n\t" |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
97 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
98 PMINUB(%%mm1, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
99 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
100 "paddb %%mm7, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
101 "pcmpgtb %%mm6, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
103 "movq (%%"REG_a",%3), %%mm2 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
104 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
105 PMINUB(%%mm2, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
106 "psubb %%mm2, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
107 "paddb %%mm7, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
108 "pcmpgtb %%mm6, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
109 "paddb %%mm1, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
110 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
111 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
112 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
113 PMINUB(%%mm1, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 "psubb %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
115 "paddb %%mm7, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
116 "pcmpgtb %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
117 "paddb %%mm2, %%mm0 \n\t" |
787 | 118 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
119 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
120 |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
121 "movq (%2, %3, 4), %%mm2 \n\t" |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
122 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
123 PMINUB(%%mm2, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
124 "psubb %%mm2, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
125 "paddb %%mm7, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
126 "pcmpgtb %%mm6, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
127 "paddb %%mm1, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
129 "movq (%%"REG_a"), %%mm1 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
130 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
131 PMINUB(%%mm1, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
132 "psubb %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
133 "paddb %%mm7, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
134 "pcmpgtb %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
135 "paddb %%mm2, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
136 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
137 "movq (%%"REG_a", %3), %%mm2 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
138 PMAXUB(%%mm2, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
139 PMINUB(%%mm2, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 "psubb %%mm2, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 "paddb %%mm7, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 "pcmpgtb %%mm6, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
143 "paddb %%mm1, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
145 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
146 PMAXUB(%%mm1, %%mm4) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
147 PMINUB(%%mm1, %%mm3, %%mm5) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
148 "psubb %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
149 "paddb %%mm7, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
150 "pcmpgtb %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
151 "paddb %%mm2, %%mm0 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
152 "psubusb %%mm3, %%mm4 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
153 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
154 " \n\t" |
167 | 155 #ifdef HAVE_MMX2 |
156 "pxor %%mm7, %%mm7 \n\t" | |
157 "psadbw %%mm7, %%mm0 \n\t" | |
158 #else | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
159 "movq %%mm0, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
160 "psrlw $8, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
161 "paddb %%mm1, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
162 "movq %%mm0, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
163 "psrlq $16, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
164 "paddb %%mm1, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
165 "movq %%mm0, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
166 "psrlq $32, %%mm0 \n\t" |
167 | 167 "paddb %%mm1, %%mm0 \n\t" |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
168 #endif |
1331 | 169 "movq %4, %%mm7 \n\t" // QP,..., QP |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
170 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
171 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
172 "packssdw %%mm4, %%mm4 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
173 "movd %%mm0, %0 \n\t" |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
174 "movd %%mm4, %1 \n\t" |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
175 |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
176 : "=r" (numEq), "=r" (dcOk) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
177 : "r" (src), "r" ((long)stride), "m" (c->pQPb) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
178 : "%"REG_a |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
179 ); |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
180 |
167 | 181 numEq= (-numEq) &0xFF; |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
182 if(numEq > c->ppMode.flatnessThreshold){ |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
183 if(dcOk) return 0; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
184 else return 1; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
185 }else{ |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
186 return 2; |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
187 } |
787 | 188 } |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
189 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
190 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 /** |
111 | 192 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
107 | 193 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 */ |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
195 #ifndef HAVE_ALTIVEC |
787 | 196 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
197 { |
96 | 198 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
111 | 199 src+= stride*3; |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 asm volatile( //"movv %0 %1 %2\n\t" |
787 | 201 "movq %2, %%mm0 \n\t" // QP,..., QP |
202 "pxor %%mm4, %%mm4 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
203 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 "movq (%0), %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 "movq (%0, %1), %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 "movq %%mm5, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
207 "movq %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
208 "psubusb %%mm6, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 "psubusb %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
211 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
787 | 212 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
214 "pand %%mm2, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
215 "pandn %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
216 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
217 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
218 "movq (%0, %1, 8), %%mm5 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
219 "lea (%0, %1, 4), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
220 "lea (%0, %1, 8), %%"REG_c" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
221 "sub %1, %%"REG_c" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
222 "add %1, %0 \n\t" // %0 points to line 1 not 0 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
223 "movq (%0, %1, 8), %%mm7 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
224 "movq %%mm5, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 "movq %%mm7, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "psubusb %%mm7, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
227 "psubusb %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
228 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
229 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
787 | 230 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
231 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
232 "pand %%mm2, %%mm7 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
233 "pandn %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
234 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
235 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
236 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
237 // 1 2 3 4 5 6 7 8 |
787 | 238 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
239 // 6 4 2 2 1 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
240 // 6 4 4 2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
241 // 6 8 2 |
111 | 242 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
243 "movq (%0, %1), %%mm0 \n\t" // 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 "movq %%mm0, %%mm1 \n\t" // 1 |
96 | 245 PAVGB(%%mm6, %%mm0) //1 1 /2 |
246 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
247 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
248 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
249 "movq %%mm2, %%mm5 \n\t" // 1 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
250 PAVGB((%%REGa), %%mm2) // 11 /2 |
96 | 251 PAVGB((%0, %1, 2), %%mm2) // 211 /4 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
252 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
253 "movq (%0), %%mm4 \n\t" // 1 |
96 | 254 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
255 PAVGB(%%mm0, %%mm3) //642211 /16 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
256 "movq %%mm3, (%0) \n\t" // X |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
257 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
258 "movq %%mm1, %%mm0 \n\t" // 1 |
96 | 259 PAVGB(%%mm6, %%mm0) //1 1 /2 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
260 "movq %%mm4, %%mm3 \n\t" // 1 |
96 | 261 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
262 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
263 PAVGB((%%REGa), %%mm5) // 211 /4 |
96 | 264 PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
265 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
266 "movq %%mm3, (%0,%1) \n\t" // X |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
267 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
96 | 268 PAVGB(%%mm4, %%mm6) //11 /2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
269 "movq (%%"REG_c"), %%mm0 \n\t" // 1 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
270 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
271 "movq %%mm0, %%mm3 \n\t" // 11/2 |
96 | 272 PAVGB(%%mm1, %%mm0) // 2 11/4 |
273 PAVGB(%%mm6, %%mm0) //222 11/8 | |
274 PAVGB(%%mm2, %%mm0) //22242211/16 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
275 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
276 "movq %%mm0, (%0, %1, 2) \n\t" // X |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
277 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
278 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
279 PAVGB((%%REGc), %%mm0) // 11 /2 |
96 | 280 PAVGB(%%mm0, %%mm6) //11 11 /4 |
281 PAVGB(%%mm1, %%mm4) // 11 /2 | |
282 PAVGB(%%mm2, %%mm1) // 11 /2 | |
283 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
284 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
285 "movq (%%"REG_a"), %%mm5 \n\t" // 1 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
286 "movq %%mm6, (%%"REG_a") \n\t" // X |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
287 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
288 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 |
96 | 289 PAVGB(%%mm7, %%mm6) // 11 /2 |
290 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
291 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
292 PAVGB(%%mm5, %%mm2) // 11 /2 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
293 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
96 | 294 PAVGB(%%mm4, %%mm2) // 112 /4 |
295 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
296 "movq %%mm6, (%0, %1, 4) \n\t" // X |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
297 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
96 | 298 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
299 PAVGB(%%mm4, %%mm5) // 11 /2 | |
300 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
301 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 |
96 | 302 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
303 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
304 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
306 PAVGB((%%REGc), %%mm2) // 112 4 /8 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
307 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 |
96 | 308 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
309 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
310 PAVGB(%%mm2, %%mm6) // 1122424 /4 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
311 "movq %%mm6, (%%"REG_c") \n\t" // X |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
312 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
96 | 313 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
314 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
315 |
96 | 316 PAVGB(%%mm3, %%mm0) // 112 /4 |
317 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
318 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
319 "sub %1, %0 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
320 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
321 : |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
322 : "r" (src), "r" ((long)stride), "m" (c->pQPb) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
323 : "%"REG_a, "%"REG_c |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
324 ); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
325 #else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
326 const int l1= stride; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
327 const int l2= stride + l1; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
328 const int l3= stride + l2; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
329 const int l4= stride + l3; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
330 const int l5= stride + l4; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
331 const int l6= stride + l5; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
332 const int l7= stride + l6; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
333 const int l8= stride + l7; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
334 const int l9= stride + l8; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
335 int x; |
111 | 336 src+= stride*3; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
337 for(x=0; x<BLOCK_SIZE; x++) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
338 { |
787 | 339 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
340 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
341 |
2038 | 342 int sums[10]; |
343 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; | |
344 sums[1] = sums[0] - first + src[l4]; | |
345 sums[2] = sums[1] - first + src[l5]; | |
346 sums[3] = sums[2] - first + src[l6]; | |
347 sums[4] = sums[3] - first + src[l7]; | |
348 sums[5] = sums[4] - src[l1] + src[l8]; | |
349 sums[6] = sums[5] - src[l2] + last; | |
350 sums[7] = sums[6] - src[l3] + last; | |
351 sums[8] = sums[7] - src[l4] + last; | |
352 sums[9] = sums[8] - src[l5] + last; | |
353 | |
354 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; | |
355 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; | |
356 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; | |
357 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; | |
358 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; | |
359 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; | |
360 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | |
361 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
362 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
363 src++; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
364 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
365 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
366 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
367 #endif //HAVE_ALTIVEC |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
368 |
787 | 369 #if 0 |
96 | 370 /** |
371 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
372 * values are correctly clipped (MMX2) | |
373 * values are wraparound (C) | |
374 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
375 0 8 16 24 | |
376 x = 8 | |
377 x/2 = 4 | |
378 x/8 = 1 | |
379 1 12 12 23 | |
380 */ | |
169 | 381 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
96 | 382 { |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
383 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
111 | 384 src+= stride*3; |
96 | 385 // FIXME rounding |
386 asm volatile( | |
387 "pxor %%mm7, %%mm7 \n\t" // 0 | |
210 | 388 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
389 "leal (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
390 "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
96 | 391 // 0 1 2 3 4 5 6 7 8 9 |
787 | 392 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
210 | 393 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
96 | 394 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
210 | 395 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
96 | 396 "psrlw $2, %%mm0 \n\t" |
210 | 397 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
96 | 398 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
399 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
400 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 |
96 | 401 "movq %%mm2, %%mm4 \n\t" // line 4 |
402 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
403 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
404 PAVGB(%%mm3, %%mm5) |
96 | 405 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
406 "psubusb %%mm3, %%mm4 \n\t" | |
407 "psubusb %%mm2, %%mm3 \n\t" | |
408 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
409 "psubusb %%mm0, %%mm4 \n\t" | |
410 "pcmpeqb %%mm7, %%mm4 \n\t" | |
411 "pand %%mm4, %%mm5 \n\t" // d/2 | |
412 | |
413 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
414 "paddb %%mm5, %%mm2 \n\t" | |
415 // "psubb %%mm6, %%mm2 \n\t" | |
416 "movq %%mm2, (%0,%1, 4) \n\t" | |
417 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
418 "movq (%%"REG_c"), %%mm2 \n\t" |
96 | 419 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
420 "psubb %%mm5, %%mm2 \n\t" | |
421 // "psubb %%mm6, %%mm2 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
422 "movq %%mm2, (%%"REG_c") \n\t" |
96 | 423 |
424 "paddb %%mm6, %%mm5 \n\t" | |
425 "psrlw $2, %%mm5 \n\t" | |
210 | 426 "pand "MANGLE(b3F)", %%mm5 \n\t" |
427 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
96 | 428 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
429 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" |
96 | 430 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 |
431 "paddsb %%mm5, %%mm2 \n\t" | |
432 "psubb %%mm6, %%mm2 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
433 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
434 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
435 "movq (%%"REG_c", %1), %%mm2 \n\t" |
96 | 436 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
437 "psubsb %%mm5, %%mm2 \n\t" | |
438 "psubb %%mm6, %%mm2 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
439 "movq %%mm2, (%%"REG_c", %1) \n\t" |
96 | 440 |
441 : | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
442 : "r" (src), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
443 : "%"REG_a, "%"REG_c |
96 | 444 ); |
445 #else | |
446 const int l1= stride; | |
447 const int l2= stride + l1; | |
448 const int l3= stride + l2; | |
449 const int l4= stride + l3; | |
450 const int l5= stride + l4; | |
451 const int l6= stride + l5; | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
452 // const int l7= stride + l6; |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
453 // const int l8= stride + l7; |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
454 // const int l9= stride + l8; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
455 int x; |
141 | 456 const int QP15= QP + (QP>>2); |
111 | 457 src+= stride*3; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
458 for(x=0; x<BLOCK_SIZE; x++) |
96 | 459 { |
141 | 460 const int v = (src[x+l5] - src[x+l4]); |
461 if(ABS(v) < QP15) | |
96 | 462 { |
141 | 463 src[x+l3] +=v>>3; |
464 src[x+l4] +=v>>1; | |
465 src[x+l5] -=v>>1; | |
466 src[x+l6] -=v>>3; | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
467 |
96 | 468 } |
469 } | |
470 | |
471 #endif | |
472 } | |
787 | 473 #endif |
96 | 474 |
475 /** | |
476 * Experimental Filter 1 | |
99 | 477 * will not damage linear gradients |
478 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | 
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
479 * can only smooth blocks at the expected locations (it can't smooth them if they moved) | 
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
480 * MMX2 version does correct clipping, C version doesn't | 
96 | 481 */ |
787 | 482 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
96 | 483 { |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
484 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
111 | 485 src+= stride*3; |
486 | |
96 | 487 asm volatile( |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
488 "pxor %%mm7, %%mm7 \n\t" // 0 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
489 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
490 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
491 // 0 1 2 3 4 5 6 7 8 9 |
787 | 492 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
493 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
494 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
495 "movq %%mm1, %%mm2 \n\t" // line 4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
496 "psubusb %%mm0, %%mm1 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
497 "psubusb %%mm2, %%mm0 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
498 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
499 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
500 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
501 "movq %%mm3, %%mm5 \n\t" // line 5 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
502 "psubusb %%mm4, %%mm3 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
503 "psubusb %%mm5, %%mm4 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
504 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
505 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
506 "movq %%mm2, %%mm1 \n\t" // line 4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
507 "psubusb %%mm5, %%mm2 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
508 "movq %%mm2, %%mm4 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
509 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
510 "psubusb %%mm1, %%mm5 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
511 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
512 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
513 "movq %%mm4, %%mm3 \n\t" // d |
787 | 514 "movq %2, %%mm0 \n\t" |
334 | 515 "paddusb %%mm0, %%mm0 \n\t" |
516 "psubusb %%mm0, %%mm4 \n\t" | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
517 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
210 | 518 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
519 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
520 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
521 PAVGB(%%mm7, %%mm3) // d/2 |
99 | 522 "movq %%mm3, %%mm1 \n\t" // d/2 |
523 PAVGB(%%mm7, %%mm3) // d/4 | |
524 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
525 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
526 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
527 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
528 "psubusb %%mm3, %%mm0 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
529 "pxor %%mm2, %%mm0 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
530 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
531 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
532 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
534 "paddusb %%mm3, %%mm0 \n\t" |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
535 "pxor %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
536 "movq %%mm0, (%%"REG_c") \n\t" // line 5 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
537 |
99 | 538 PAVGB(%%mm7, %%mm1) // d/4 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
539 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
540 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
541 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
99 | 542 "psubusb %%mm1, %%mm0 \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
543 "pxor %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
544 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
545 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
546 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
547 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
99 | 548 "paddusb %%mm1, %%mm0 \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
549 "pxor %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
550 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
551 |
99 | 552 PAVGB(%%mm7, %%mm1) // d/8 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
553 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
554 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
555 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
99 | 556 "psubusb %%mm1, %%mm0 \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
557 "pxor %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
558 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
559 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
560 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
561 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
99 | 562 "paddusb %%mm1, %%mm0 \n\t" |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
563 "pxor %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
564 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 |
96 | 565 |
566 : | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
567 : "r" (src), "r" ((long)stride), "m" (co->pQPb) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
568 : "%"REG_a, "%"REG_c |
96 | 569 ); |
570 #else | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
571 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
572 const int l1= stride; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
573 const int l2= stride + l1; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
574 const int l3= stride + l2; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
575 const int l4= stride + l3; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
576 const int l5= stride + l4; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
577 const int l6= stride + l5; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
578 const int l7= stride + l6; |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
579 // const int l8= stride + l7; |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
580 // const int l9= stride + l8; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
581 int x; |
111 | 582 |
583 src+= stride*3; | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
584 for(x=0; x<BLOCK_SIZE; x++) |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
585 { |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
586 int a= src[l3] - src[l4]; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
587 int b= src[l4] - src[l5]; |
99 | 588 int c= src[l5] - src[l6]; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
589 |
141 | 590 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
591 d= MAX(d, 0); | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
592 |
787 | 593 if(d < co->QP*2) |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
594 { |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
595 int v = d * SIGN(-b); |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
596 |
141 | 597 src[l2] +=v>>3; |
598 src[l3] +=v>>2; | |
599 src[l4] +=(3*v)>>3; | |
600 src[l5] -=(3*v)>>3; | |
601 src[l6] -=v>>2; | |
602 src[l7] -=v>>3; | |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
603 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
604 } |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
605 src++; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
606 } |
96 | 607 #endif |
608 } | |
609 | |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
610 #ifndef HAVE_ALTIVEC |
787 | 611 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
612 { |
163 | 613 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
614 /* | |
615 uint8_t tmp[16]; | |
616 const int l1= stride; | |
617 const int l2= stride + l1; | |
618 const int l3= stride + l2; | |
619 const int l4= (int)tmp - (int)src - stride*3; | |
620 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
621 const int l6= stride*3 + l3; | |
622 const int l7= stride + l6; | |
623 const int l8= stride + l7; | |
624 | |
625 memcpy(tmp, src+stride*7, 8); | |
626 memcpy(tmp+8, src+stride*8, 8); | |
627 */ | |
111 | 628 src+= stride*4; |
163 | 629 asm volatile( |
630 | |
631 #if 0 //slightly more accurate and slightly slower | |
632 "pxor %%mm7, %%mm7 \n\t" // 0 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
633 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
634 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
163 | 635 // 0 1 2 3 4 5 6 7 |
787 | 636 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
637 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
163 | 638 |
639 | |
640 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
641 "movq (%0), %%mm1 \n\t" // l0 | |
642 "movq %%mm0, %%mm2 \n\t" // l2 | |
643 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
644 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
645 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
646 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
647 "movq (%%"REG_a"), %%mm1 \n\t" // l1 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
648 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 |
163 | 649 "movq %%mm1, %%mm4 \n\t" // l1 |
650 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
651 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
652 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
653 | |
654 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
655 "psubusb %%mm1, %%mm0 \n\t" | |
656 "psubusb %%mm4, %%mm1 \n\t" | |
657 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
658 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
659 | |
660 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
661 "movq %%mm0, %%mm4 \n\t" // l4 | |
662 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
663 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
664 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
665 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
666 "movq (%%"REG_c"), %%mm2 \n\t" // l5 |
163 | 667 "movq %%mm3, %%mm5 \n\t" // l3 |
668 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
669 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
670 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
671 | |
672 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
673 "psubusb %%mm3, %%mm0 \n\t" | |
674 "psubusb %%mm6, %%mm3 \n\t" | |
675 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
676 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
677 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
678 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
679 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 |
163 | 680 "movq %%mm6, %%mm5 \n\t" // l6 |
681 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
682 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
683 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
684 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
685 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 |
163 | 686 "movq %%mm2, %%mm4 \n\t" // l5 |
687 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
688 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
689 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
690 | |
691 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
692 "psubusb %%mm2, %%mm6 \n\t" | |
693 "psubusb %%mm4, %%mm2 \n\t" | |
694 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
695 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
696 | |
697 | |
698 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
787 | 699 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
210 | 700 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
163 | 701 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
702 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
703 "pand %%mm4, %%mm3 \n\t" | |
704 | |
705 "movq %%mm3, %%mm1 \n\t" | |
210 | 706 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
163 | 707 PAVGB(%%mm7, %%mm3) |
708 PAVGB(%%mm7, %%mm3) | |
709 "paddusb %%mm1, %%mm3 \n\t" | |
210 | 710 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
163 | 711 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
712 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 |
163 | 713 "movq (%0, %1, 4), %%mm5 \n\t" //l4 |
714 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
715 "psubusb %%mm6, %%mm5 \n\t" | |
716 "psubusb %%mm4, %%mm6 \n\t" | |
717 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
718 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
719 "pxor %%mm6, %%mm0 \n\t" | |
720 "pand %%mm0, %%mm3 \n\t" | |
721 PMINUB(%%mm5, %%mm3, %%mm0) | |
722 | |
210 | 723 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
163 | 724 PAVGB(%%mm7, %%mm3) |
725 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
726 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
163 | 727 "movq (%0, %1, 4), %%mm2 \n\t" |
728 "pxor %%mm6, %%mm0 \n\t" | |
729 "pxor %%mm6, %%mm2 \n\t" | |
730 "psubb %%mm3, %%mm0 \n\t" | |
731 "paddb %%mm3, %%mm2 \n\t" | |
732 "pxor %%mm6, %%mm0 \n\t" | |
733 "pxor %%mm6, %%mm2 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
734 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
163 | 735 "movq %%mm2, (%0, %1, 4) \n\t" |
736 #endif | |
737 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
738 "lea (%0, %1), %%"REG_a" \n\t" |
163 | 739 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 |
740 // 0 1 2 3 4 5 6 7 | |
787 | 741 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
742 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
163 | 743 |
744 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
745 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 |
163 | 746 "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
747 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
748 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
749 // mm1=-l3-1, mm0=128-q | |
750 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
751 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
752 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 |
163 | 753 "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
754 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
210 | 755 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
756 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
163 | 757 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
758 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
759 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
760 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
761 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
762 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
763 "movq (%%"REG_a"), %%mm2 \n\t" // l1 |
163 | 764 "pxor %%mm6, %%mm2 \n\t" // -l1-1 |
765 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
766 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
210 | 767 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
163 | 768 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
769 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
770 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
771 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
772 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
773 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
774 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 |
163 | 775 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
776 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
210 | 777 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
163 | 778 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
779 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
780 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
781 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
782 | |
210 | 783 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
784 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
163 | 785 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
786 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
787 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
788 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
789 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
790 | |
791 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
792 | |
210 | 793 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
787 | 794 "movq %2, %%mm2 \n\t" // QP |
163 | 795 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
796 "psubb %%mm6, %%mm2 \n\t" | |
797 | |
798 "movq %%mm4, %%mm1 \n\t" | |
799 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
800 "pxor %%mm1, %%mm4 \n\t" | |
801 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
802 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
803 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
804 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
805 | |
806 "movq %%mm4, %%mm3 \n\t" // d | |
210 | 807 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
163 | 808 PAVGB(%%mm7, %%mm4) // d/32 |
809 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
810 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
811 "pand %%mm2, %%mm4 \n\t" | |
812 | |
210 | 813 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
163 | 814 "psubb %%mm0, %%mm5 \n\t" // q |
815 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
816 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
817 "pxor %%mm7, %%mm5 \n\t" | |
818 | |
819 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
820 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
821 | |
822 "pand %%mm7, %%mm4 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
823 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
163 | 824 "movq (%0, %1, 4), %%mm2 \n\t" |
825 "pxor %%mm1, %%mm0 \n\t" | |
826 "pxor %%mm1, %%mm2 \n\t" | |
827 "paddb %%mm4, %%mm0 \n\t" | |
828 "psubb %%mm4, %%mm2 \n\t" | |
829 "pxor %%mm1, %%mm0 \n\t" | |
830 "pxor %%mm1, %%mm2 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
831 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
163 | 832 "movq %%mm2, (%0, %1, 4) \n\t" |
833 | |
834 : | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
835 : "r" (src), "r" ((long)stride), "m" (c->pQPb) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
836 : "%"REG_a, "%"REG_c |
163 | 837 ); |
838 | |
839 /* | |
840 { | |
841 int x; | |
842 src-= stride; | |
843 for(x=0; x<BLOCK_SIZE; x++) | |
844 { | |
845 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
846 if(ABS(middleEnergy)< 8*QP) | |
847 { | |
848 const int q=(src[l4] - src[l5])/2; | |
849 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
850 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
851 | |
852 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
853 d= MAX(d, 0); | |
854 | |
855 d= (5*d + 32) >> 6; | |
856 d*= SIGN(-middleEnergy); | |
857 | |
858 if(q>0) | |
859 { | |
860 d= d<0 ? 0 : d; | |
861 d= d>q ? q : d; | |
862 } | |
863 else | |
864 { | |
865 d= d>0 ? 0 : d; | |
866 d= d<q ? q : d; | |
867 } | |
868 | |
869 src[l4]-= d; | |
870 src[l5]+= d; | |
871 } | |
872 src++; | |
873 } | |
874 src-=8; | |
875 for(x=0; x<8; x++) | |
876 { | |
877 int y; | |
878 for(y=4; y<6; y++) | |
879 { | |
880 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
881 int ad= ABS(d); | |
882 static int max=0; | |
883 static int sum=0; | |
884 static int num=0; | |
885 static int bias=0; | |
886 | |
887 if(max<ad) max=ad; | |
888 sum+= ad>3 ? 1 : 0; | |
889 if(ad>3) | |
890 { | |
891 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
892 } | |
893 if(y==4) bias+=d; | |
894 num++; | |
895 if(num%1000000 == 0) | |
896 { | |
897 printf(" %d %d %d %d\n", num, sum, max, bias); | |
898 } | |
899 } | |
900 } | |
901 } | |
902 */ | |
903 #elif defined (HAVE_MMX) | |
904 src+= stride*4; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
905 asm volatile( |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
906 "pxor %%mm7, %%mm7 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
907 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
908 "and "ALIGN_MASK", %%"REG_c" \n\t" // align |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
909 // 0 1 2 3 4 5 6 7 |
787 | 910 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
911 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
912 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
913 "movq (%0), %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
914 "movq %%mm0, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
915 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
916 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
917 |
810 | 918 "movq (%0, %1), %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
919 "lea (%0, %1, 2), %%"REG_a" \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
920 "movq %%mm2, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
921 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
922 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
923 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
924 "movq (%%"REG_a"), %%mm4 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
925 "movq %%mm4, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
926 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
927 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
928 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
929 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
930 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
931 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
932 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
933 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
934 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
935 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
936 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
937 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
940 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
941 "movq (%%"REG_a", %1), %%mm2 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
942 "movq %%mm2, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
943 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
944 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
945 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
946 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
947 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
948 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
949 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
950 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
951 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
952 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
953 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 "movq %%mm0, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
956 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
959 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
960 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
961 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
963 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
967 "lea (%%"REG_a", %1), %0 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
970 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 //50 opcodes so far |
810 | 973 "movq (%0, %1, 2), %%mm2 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 "movq %%mm2, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
977 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
980 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
981 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
982 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
983 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
984 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
985 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
986 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
987 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
988 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
989 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
990 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
991 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
992 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
993 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
994 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
995 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
996 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
997 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
998 |
810 | 999 "movq (%0, %1, 4), %%mm2 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1000 "movq %%mm2, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1001 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1002 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1003 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1004 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1005 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1009 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1010 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
140 | 1011 |
1012 #ifdef HAVE_MMX2 | |
1013 "movq %%mm7, %%mm6 \n\t" // 0 | |
1014 "psubw %%mm0, %%mm6 \n\t" | |
1015 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
1016 "movq %%mm7, %%mm6 \n\t" // 0 | |
1017 "psubw %%mm1, %%mm6 \n\t" | |
1018 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
1019 "movq %%mm7, %%mm6 \n\t" // 0 | |
1020 "psubw %%mm2, %%mm6 \n\t" | |
1021 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
1022 "movq %%mm7, %%mm6 \n\t" // 0 | |
1023 "psubw %%mm3, %%mm6 \n\t" | |
1024 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
1025 #else | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 "movq %%mm7, %%mm6 \n\t" // 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "pcmpgtw %%mm0, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 "pxor %%mm6, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "movq %%mm7, %%mm6 \n\t" // 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 "pcmpgtw %%mm1, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 "pxor %%mm6, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1034 "movq %%mm7, %%mm6 \n\t" // 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1035 "pcmpgtw %%mm2, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1036 "pxor %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "movq %%mm7, %%mm6 \n\t" // 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "pcmpgtw %%mm3, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "pxor %%mm6, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
140 | 1042 #endif |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 #ifdef HAVE_MMX2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 "pminsw %%mm2, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 "pminsw %%mm3, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 #else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1048 "movq %%mm0, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 "psubusw %%mm2, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 "psubw %%mm6, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1051 "movq %%mm1, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1052 "psubusw %%mm3, %%mm6 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1053 "psubw %%mm6, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1054 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1056 "movd %2, %%mm2 \n\t" // QP |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1057 "punpcklbw %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1058 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1059 "movq %%mm7, %%mm6 \n\t" // 0 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1060 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1061 "pxor %%mm6, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1062 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1063 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1064 "pxor %%mm7, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1065 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1066 // 100 opcodes |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1067 "psllw $3, %%mm2 \n\t" // 8QP |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1068 "movq %%mm2, %%mm3 \n\t" // 8QP |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1069 "pcmpgtw %%mm4, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1070 "pcmpgtw %%mm5, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 "pand %%mm2, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1072 "pand %%mm3, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1073 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "psubusw %%mm0, %%mm4 \n\t" // ld |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 "psubusw %%mm1, %%mm5 \n\t" // hd |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1078 |
211 | 1079 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 "pmullw %%mm2, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "pmullw %%mm2, %%mm5 \n\t" |
211 | 1082 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 "paddw %%mm2, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "paddw %%mm2, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1085 "psrlw $6, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 "psrlw $6, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1088 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1089 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "pxor %%mm2, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "pxor %%mm3, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1096 "pxor %%mm2, %%mm0 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 "pxor %%mm3, %%mm1 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1098 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1099 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1100 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1101 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1102 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1103 "pxor %%mm6, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1104 "pxor %%mm7, %%mm3 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1105 "pand %%mm2, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 "pand %%mm3, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 #ifdef HAVE_MMX2 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1109 "pminsw %%mm0, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1110 "pminsw %%mm1, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 #else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 "movq %%mm4, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 "psubusw %%mm0, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1114 "psubw %%mm2, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1115 "movq %%mm5, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1116 "psubusw %%mm1, %%mm2 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1117 "psubw %%mm2, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1118 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1119 "pxor %%mm6, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 "pxor %%mm7, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 "psubw %%mm6, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1122 "psubw %%mm7, %%mm5 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1123 "packsswb %%mm5, %%mm4 \n\t" |
810 | 1124 "movq (%0), %%mm0 \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1125 "paddb %%mm4, %%mm0 \n\t" |
810 | 1126 "movq %%mm0, (%0) \n\t" |
1127 "movq (%0, %1), %%mm0 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1128 "psubb %%mm4, %%mm0 \n\t" |
810 | 1129 "movq %%mm0, (%0, %1) \n\t" |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 |
810 | 1131 : "+r" (src) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1132 : "r" ((long)stride), "m" (c->pQPb) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1133 : "%"REG_a, "%"REG_c |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 ); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 #else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 const int l1= stride; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 const int l2= stride + l1; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 const int l3= stride + l2; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 const int l4= stride + l3; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 const int l5= stride + l4; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1141 const int l6= stride + l5; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1142 const int l7= stride + l6; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 const int l8= stride + l7; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 // const int l9= stride + l8; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1145 int x; |
111 | 1146 src+= stride*3; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1147 for(x=0; x<BLOCK_SIZE; x++) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
787 | 1150 if(ABS(middleEnergy) < 8*c->QP) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1152 const int q=(src[l4] - src[l5])/2; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1153 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1154 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1155 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1156 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1157 d= MAX(d, 0); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1158 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1159 d= (5*d + 32) >> 6; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1160 d*= SIGN(-middleEnergy); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1161 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1162 if(q>0) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1163 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1164 d= d<0 ? 0 : d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1165 d= d>q ? q : d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1166 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1167 else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1168 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1169 d= d>0 ? 0 : d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1170 d= d<q ? q : d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1171 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1172 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1173 src[l4]-= d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1174 src[l5]+= d; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1175 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1176 src++; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1177 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1178 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1179 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1180 #endif //HAVE_ALTIVEC |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1181 |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1182 #ifndef HAVE_ALTIVEC |
787 | 1183 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1184 { |
132 | 1185 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1186 asm volatile( |
787 | 1187 "pxor %%mm6, %%mm6 \n\t" |
1188 "pcmpeqb %%mm7, %%mm7 \n\t" | |
1189 "movq %2, %%mm0 \n\t" | |
1190 "punpcklbw %%mm6, %%mm0 \n\t" | |
1191 "psrlw $1, %%mm0 \n\t" | |
1192 "psubw %%mm7, %%mm0 \n\t" | |
1193 "packuswb %%mm0, %%mm0 \n\t" | |
1194 "movq %%mm0, %3 \n\t" | |
130 | 1195 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1196 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1197 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
787 | 1198 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1199 // 0 1 2 3 4 5 6 7 8 9 |
787 | 1200 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1201 |
169 | 1202 #undef FIND_MIN_MAX |
132 | 1203 #ifdef HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1204 #define REAL_FIND_MIN_MAX(addr)\ |
130 | 1205 "movq " #addr ", %%mm0 \n\t"\ |
167 | 1206 "pminub %%mm0, %%mm7 \n\t"\ |
1207 "pmaxub %%mm0, %%mm6 \n\t" | |
132 | 1208 #else |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1209 #define REAL_FIND_MIN_MAX(addr)\ |
132 | 1210 "movq " #addr ", %%mm0 \n\t"\ |
167 | 1211 "movq %%mm7, %%mm1 \n\t"\ |
1212 "psubusb %%mm0, %%mm6 \n\t"\ | |
1213 "paddb %%mm0, %%mm6 \n\t"\ | |
132 | 1214 "psubusb %%mm0, %%mm1 \n\t"\ |
167 | 1215 "psubb %%mm1, %%mm7 \n\t" |
132 | 1216 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1217 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1218 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1219 FIND_MIN_MAX((%%REGa)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1220 FIND_MIN_MAX((%%REGa, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1221 FIND_MIN_MAX((%%REGa, %1, 2)) |
130 | 1222 FIND_MIN_MAX((%0, %1, 4)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1223 FIND_MIN_MAX((%%REGd)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1224 FIND_MIN_MAX((%%REGd, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1225 FIND_MIN_MAX((%%REGd, %1, 2)) |
130 | 1226 FIND_MIN_MAX((%0, %1, 8)) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1227 |
167 | 1228 "movq %%mm7, %%mm4 \n\t" |
1229 "psrlq $8, %%mm7 \n\t" | |
1230 #ifdef HAVE_MMX2 | |
1231 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1232 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
1233 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
1234 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
1235 "pminub %%mm4, %%mm7 \n\t" | |
1236 #else | |
1237 "movq %%mm7, %%mm1 \n\t" | |
1238 "psubusb %%mm4, %%mm1 \n\t" | |
1239 "psubb %%mm1, %%mm7 \n\t" | |
1240 "movq %%mm7, %%mm4 \n\t" | |
1241 "psrlq $16, %%mm7 \n\t" | |
1242 "movq %%mm7, %%mm1 \n\t" | |
1243 "psubusb %%mm4, %%mm1 \n\t" | |
1244 "psubb %%mm1, %%mm7 \n\t" | |
1245 "movq %%mm7, %%mm4 \n\t" | |
1246 "psrlq $32, %%mm7 \n\t" | |
1247 "movq %%mm7, %%mm1 \n\t" | |
1248 "psubusb %%mm4, %%mm1 \n\t" | |
1249 "psubb %%mm1, %%mm7 \n\t" | |
1250 #endif | |
1251 | |
1252 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1253 "movq %%mm6, %%mm4 \n\t" |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1254 "psrlq $8, %%mm6 \n\t" |
132 | 1255 #ifdef HAVE_MMX2 |
167 | 1256 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1257 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
167 | 1258 "pmaxub %%mm4, %%mm6 \n\t" |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1259 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
167 | 1260 "pmaxub %%mm4, %%mm6 \n\t" |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1261 #else |
167 | 1262 "psubusb %%mm4, %%mm6 \n\t" |
1263 "paddb %%mm4, %%mm6 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1264 "movq %%mm6, %%mm4 \n\t" |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1265 "psrlq $16, %%mm6 \n\t" |
167 | 1266 "psubusb %%mm4, %%mm6 \n\t" |
1267 "paddb %%mm4, %%mm6 \n\t" | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1268 "movq %%mm6, %%mm4 \n\t" |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1269 "psrlq $32, %%mm6 \n\t" |
167 | 1270 "psubusb %%mm4, %%mm6 \n\t" |
1271 "paddb %%mm4, %%mm6 \n\t" | |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1272 #endif |
167 | 1273 "movq %%mm6, %%mm0 \n\t" // max |
1274 "psubb %%mm7, %%mm6 \n\t" // max - min | |
1275 "movd %%mm6, %%ecx \n\t" | |
210 | 1276 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
167 | 1277 " jb 1f \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1278 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1279 "and "ALIGN_MASK", %%"REG_c" \n\t" |
167 | 1280 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1281 "punpcklbw %%mm7, %%mm7 \n\t" |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1282 "punpcklbw %%mm7, %%mm7 \n\t" |
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1283 "punpcklbw %%mm7, %%mm7 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1284 "movq %%mm7, (%%"REG_c") \n\t" |
130 | 1285 |
1286 "movq (%0), %%mm0 \n\t" // L10 | |
1287 "movq %%mm0, %%mm1 \n\t" // L10 | |
1288 "movq %%mm0, %%mm2 \n\t" // L10 | |
1289 "psllq $8, %%mm1 \n\t" | |
1290 "psrlq $8, %%mm2 \n\t" | |
1291 "movd -4(%0), %%mm3 \n\t" | |
1292 "movd 8(%0), %%mm4 \n\t" | |
1293 "psrlq $24, %%mm3 \n\t" | |
1294 "psllq $56, %%mm4 \n\t" | |
1295 "por %%mm3, %%mm1 \n\t" // L00 | |
1296 "por %%mm4, %%mm2 \n\t" // L20 | |
1297 "movq %%mm1, %%mm3 \n\t" // L00 | |
1298 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
1299 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
1300 "psubusb %%mm7, %%mm0 \n\t" | |
1301 "psubusb %%mm7, %%mm2 \n\t" | |
1302 "psubusb %%mm7, %%mm3 \n\t" | |
210 | 1303 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
1304 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
1305 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
130 | 1306 "paddb %%mm2, %%mm0 \n\t" |
1307 "paddb %%mm3, %%mm0 \n\t" | |
1308 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1309 "movq (%%"REG_a"), %%mm2 \n\t" // L11 |
130 | 1310 "movq %%mm2, %%mm3 \n\t" // L11 |
1311 "movq %%mm2, %%mm4 \n\t" // L11 | |
1312 "psllq $8, %%mm3 \n\t" | |
1313 "psrlq $8, %%mm4 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1314 "movd -4(%%"REG_a"), %%mm5 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1315 "movd 8(%%"REG_a"), %%mm6 \n\t" |
130 | 1316 "psrlq $24, %%mm5 \n\t" |
1317 "psllq $56, %%mm6 \n\t" | |
1318 "por %%mm5, %%mm3 \n\t" // L01 | |
1319 "por %%mm6, %%mm4 \n\t" // L21 | |
1320 "movq %%mm3, %%mm5 \n\t" // L01 | |
1321 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
1322 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
1323 "psubusb %%mm7, %%mm2 \n\t" | |
1324 "psubusb %%mm7, %%mm4 \n\t" | |
1325 "psubusb %%mm7, %%mm5 \n\t" | |
210 | 1326 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
1327 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
1328 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
130 | 1329 "paddb %%mm4, %%mm2 \n\t" |
1330 "paddb %%mm5, %%mm2 \n\t" | |
1331 // 0, 2, 3, 1 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1332 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
130 | 1333 "movq " #src ", " #sx " \n\t" /* src[0] */\ |
1334 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
1335 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
1336 "psllq $8, " #lx " \n\t"\ | |
1337 "psrlq $8, " #t0 " \n\t"\ | |
1338 "movd -4" #src ", " #t1 " \n\t"\ | |
1339 "psrlq $24, " #t1 " \n\t"\ | |
1340 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
1341 "movd 8" #src ", " #t1 " \n\t"\ | |
1342 "psllq $56, " #t1 " \n\t"\ | |
1343 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
1344 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
1345 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
1346 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
135 | 1347 PAVGB(lx, pplx) \ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1348 "movq " #lx ", 8(%%"REG_c") \n\t"\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1349 "movq (%%"REG_c"), " #lx " \n\t"\ |
140 | 1350 "psubusb " #lx ", " #t1 " \n\t"\ |
1351 "psubusb " #lx ", " #t0 " \n\t"\ | |
1352 "psubusb " #lx ", " #sx " \n\t"\ | |
210 | 1353 "movq "MANGLE(b00)", " #lx " \n\t"\ |
140 | 1354 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
1355 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
1356 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
130 | 1357 "paddb " #t1 ", " #t0 " \n\t"\ |
1358 "paddb " #t0 ", " #sx " \n\t"\ | |
1359 \ | |
1360 PAVGB(plx, pplx) /* filtered */\ | |
1361 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
134 | 1362 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
787 | 1363 "psubusb %3, " #t0 " \n\t"\ |
1364 "paddusb %3, " #t1 " \n\t"\ | |
134 | 1365 PMAXUB(t0, pplx)\ |
1366 PMINUB(t1, pplx, t0)\ | |
130 | 1367 "paddb " #sx ", " #ppsx " \n\t"\ |
1368 "paddb " #psx ", " #ppsx " \n\t"\ | |
210 | 1369 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
1370 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
140 | 1371 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
134 | 1372 "pand " #ppsx ", " #pplx " \n\t"\ |
130 | 1373 "pandn " #dst ", " #ppsx " \n\t"\ |
140 | 1374 "por " #pplx ", " #ppsx " \n\t"\ |
135 | 1375 "movq " #ppsx ", " #dst " \n\t"\ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1376 "movq 8(%%"REG_c"), " #lx " \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1377 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1378 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1379 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) |
130 | 1380 /* |
1381 0000000 | |
1382 1111111 | |
1383 | |
1384 1111110 | |
1385 1111101 | |
1386 1111100 | |
1387 1111011 | |
1388 1111010 | |
1389 1111001 | |
1390 | |
1391 1111000 | |
1392 1110111 | |
1393 | |
1394 */ | |
1395 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1396 DERING_CORE((%%REGa),(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1397 DERING_CORE((%%REGa, %1),(%%REGa, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1398 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1399 DERING_CORE((%0, %1, 4),(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1400 DERING_CORE((%%REGd),(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1401 DERING_CORE((%%REGd, %1), (%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1402 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1403 DERING_CORE((%0, %1, 8),(%%REGd, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1404 |
167 | 1405 "1: \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1406 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1407 : "%"REG_a, "%"REG_d, "%"REG_c |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1408 ); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1409 #else |
134 | 1410 int y; |
1411 int min=255; | |
1412 int max=0; | |
1413 int avg; | |
1414 uint8_t *p; | |
1415 int s[10]; | |
787 | 1416 const int QP2= c->QP/2 + 1; |
134 | 1417 |
1418 for(y=1; y<9; y++) | |
1419 { | |
1420 int x; | |
1421 p= src + stride*y; | |
1422 for(x=1; x<9; x++) | |
1423 { | |
1424 p++; | |
1425 if(*p > max) max= *p; | |
1426 if(*p < min) min= *p; | |
1427 } | |
1428 } | |
787 | 1429 avg= (min + max + 1)>>1; |
134 | 1430 |
167 | 1431 if(max - min <deringThreshold) return; |
1432 | |
134 | 1433 for(y=0; y<10; y++) |
1434 { | |
1435 int t = 0; | |
787 | 1436 |
1437 if(src[stride*y + 0] > avg) t+= 1; | |
1438 if(src[stride*y + 1] > avg) t+= 2; | |
1439 if(src[stride*y + 2] > avg) t+= 4; | |
1440 if(src[stride*y + 3] > avg) t+= 8; | |
1441 if(src[stride*y + 4] > avg) t+= 16; | |
1442 if(src[stride*y + 5] > avg) t+= 32; | |
1443 if(src[stride*y + 6] > avg) t+= 64; | |
1444 if(src[stride*y + 7] > avg) t+= 128; | |
1445 if(src[stride*y + 8] > avg) t+= 256; | |
1446 if(src[stride*y + 9] > avg) t+= 512; | |
1447 | |
134 | 1448 t |= (~t)<<16; |
1449 t &= (t<<1) & (t>>1); | |
1450 s[y] = t; | |
1451 } | |
787 | 1452 |
1453 for(y=1; y<9; y++) | |
1454 { | |
1455 int t = s[y-1] & s[y] & s[y+1]; | |
1456 t|= t>>16; | |
1457 s[y-1]= t; | |
1458 } | |
134 | 1459 |
1460 for(y=1; y<9; y++) | |
1461 { | |
1462 int x; | |
787 | 1463 int t = s[y-1]; |
134 | 1464 |
1465 p= src + stride*y; | |
1466 for(x=1; x<9; x++) | |
1467 { | |
1468 p++; | |
1469 if(t & (1<<x)) | |
1470 { | |
1471 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
1472 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
1473 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
1474 f= (f + 8)>>4; | |
1475 | |
167 | 1476 #ifdef DEBUG_DERING_THRESHOLD |
1477 asm volatile("emms\n\t":); | |
1478 { | |
1479 static long long numPixels=0; | |
1480 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
1481 // if((max-min)<20 || (max-min)*QP<200) | |
1482 // if((max-min)*QP < 500) | |
1483 // if(max-min<QP/2) | |
1484 if(max-min < 20) | |
1485 { | |
1486 static int numSkiped=0; | |
1487 static int errorSum=0; | |
1488 static int worstQP=0; | |
1489 static int worstRange=0; | |
1490 static int worstDiff=0; | |
1491 int diff= (f - *p); | |
1492 int absDiff= ABS(diff); | |
1493 int error= diff*diff; | |
1494 | |
1495 if(x==1 || x==8 || y==1 || y==8) continue; | |
1496 | |
1497 numSkiped++; | |
1498 if(absDiff > worstDiff) | |
1499 { | |
1500 worstDiff= absDiff; | |
1501 worstQP= QP; | |
1502 worstRange= max-min; | |
1503 } | |
1504 errorSum+= error; | |
1505 | |
1506 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
1507 { | |
1508 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
1509 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
1510 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
1511 worstDiff, (float)numSkiped/numPixels); | |
1512 } | |
1513 } | |
1514 } | |
1515 #endif | |
787 | 1516 if (*p + QP2 < f) *p= *p + QP2; |
1517 else if(*p - QP2 > f) *p= *p - QP2; | |
134 | 1518 else *p=f; |
1519 } | |
1520 } | |
1521 } | |
167 | 1522 #ifdef DEBUG_DERING_THRESHOLD |
1523 if(max-min < 20) | |
1524 { | |
1525 for(y=1; y<9; y++) | |
1526 { | |
1527 int x; | |
1528 int t = 0; | |
1529 p= src + stride*y; | |
1530 for(x=1; x<9; x++) | |
1531 { | |
1532 p++; | |
1533 *p = MIN(*p + 20, 255); | |
1534 } | |
1535 } | |
1536 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
1537 } | |
1538 #endif | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1539 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1540 } |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1541 #endif //HAVE_ALTIVEC |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1542 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1543 /** |
1109 | 1544 * Deinterlaces the given block by linearly interpolating every second line. |
142 | 1545 * will be called for every 8x8 block and can read & write from line 4-15 |
1546 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1547 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1548 */ |
169 | 1549 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1550 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1551 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
142 | 1552 src+= 4*stride; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1553 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1554 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1555 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1556 // 0 1 2 3 4 5 6 7 8 9 |
787 | 1557 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1558 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1559 "movq (%0), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1560 "movq (%%"REG_a", %1), %%mm1 \n\t" |
111 | 1561 PAVGB(%%mm1, %%mm0) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1562 "movq %%mm0, (%%"REG_a") \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1563 "movq (%0, %1, 4), %%mm0 \n\t" |
111 | 1564 PAVGB(%%mm0, %%mm1) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1565 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1566 "movq (%%"REG_c", %1), %%mm1 \n\t" |
111 | 1567 PAVGB(%%mm1, %%mm0) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1568 "movq %%mm0, (%%"REG_c") \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1569 "movq (%0, %1, 8), %%mm0 \n\t" |
111 | 1570 PAVGB(%%mm0, %%mm1) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1571 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1572 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1573 : : "r" (src), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1574 : "%"REG_a, "%"REG_c |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1575 ); |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1576 #else |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1577 int a, b, x; |
142 | 1578 src+= 4*stride; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1579 |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1580 for(x=0; x<2; x++){ |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1581 a= *(uint32_t*)&src[stride*0]; |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1582 b= *(uint32_t*)&src[stride*2]; |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1583 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1584 a= *(uint32_t*)&src[stride*4]; |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1585 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1586 b= *(uint32_t*)&src[stride*6]; |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1587 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1588 a= *(uint32_t*)&src[stride*8]; |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1589 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1590 src += 4; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1591 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1592 #endif |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1593 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1594 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1595 /** |
1109 | 1596 * Deinterlaces the given block by cubic interpolating every second line. |
142 | 1597 * will be called for every 8x8 block and can read & write from line 4-15 |
1598 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1599 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1600 * this filter will read lines 3-15 and write 6-12 | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1601 */ |
169 | 1602 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1603 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1604 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
142 | 1605 src+= stride*3; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1606 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1607 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1608 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1609 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1610 "add %1, %%"REG_c" \n\t" |
111 | 1611 "pxor %%mm7, %%mm7 \n\t" |
1612 // 0 1 2 3 4 5 6 7 8 9 10 | |
787 | 1613 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1614 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1615 #define REAL_DEINT_CUBIC(a,b,c,d,e)\ |
111 | 1616 "movq " #a ", %%mm0 \n\t"\ |
1617 "movq " #b ", %%mm1 \n\t"\ | |
1618 "movq " #d ", %%mm2 \n\t"\ | |
1619 "movq " #e ", %%mm3 \n\t"\ | |
1620 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
1621 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ | |
1622 "movq %%mm0, %%mm2 \n\t"\ | |
1623 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1624 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
1625 "movq %%mm1, %%mm3 \n\t"\ | |
1626 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1627 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1628 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
1629 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
1630 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
1631 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
1632 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
1633 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
1634 "packuswb %%mm3, %%mm1 \n\t"\ | |
1635 "movq %%mm1, " #c " \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1636 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1637 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1638 DEINT_CUBIC((%0), (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1639 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4), (%%REGd), (%%REGd, %1), (%0, %1, 8)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1640 DEINT_CUBIC((%0, %1, 4), (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGc)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1641 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8), (%%REGd, %1, 4), (%%REGc), (%%REGc, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1642 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1643 : : "r" (src), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1644 : "%"REG_a, "%"REG_d, "%"REG_c |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1645 ); |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1646 #else |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1647 int x; |
142 | 1648 src+= stride*3; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1649 for(x=0; x<8; x++) |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1650 { |
1157 | 1651 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
1652 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
1653 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
1654 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1655 src++; |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1656 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1657 #endif |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1658 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1659 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1660 /** |
1109 | 1661 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
142 | 1662 * will be called for every 8x8 block and can read & write from line 4-15 |
1663 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1664 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
787 | 1665 * this filter will read lines 4-13 and write 5-11 |
1666 */ | |
1667 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
1668 { | |
1669 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1670 src+= stride*4; | |
1671 asm volatile( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1672 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1673 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
787 | 1674 "pxor %%mm7, %%mm7 \n\t" |
1675 "movq (%2), %%mm0 \n\t" | |
1676 // 0 1 2 3 4 5 6 7 8 9 10 | |
1677 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
1678 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1679 #define REAL_DEINT_FF(a,b,c,d)\ |
787 | 1680 "movq " #a ", %%mm1 \n\t"\ |
1681 "movq " #b ", %%mm2 \n\t"\ | |
1682 "movq " #c ", %%mm3 \n\t"\ | |
1683 "movq " #d ", %%mm4 \n\t"\ | |
1684 PAVGB(%%mm3, %%mm1) \ | |
1685 PAVGB(%%mm4, %%mm0) \ | |
1686 "movq %%mm0, %%mm3 \n\t"\ | |
1687 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
1688 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
1689 "movq %%mm1, %%mm4 \n\t"\ | |
1690 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
1691 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
1692 "psllw $2, %%mm1 \n\t"\ | |
1693 "psllw $2, %%mm4 \n\t"\ | |
1694 "psubw %%mm0, %%mm1 \n\t"\ | |
1695 "psubw %%mm3, %%mm4 \n\t"\ | |
1696 "movq %%mm2, %%mm5 \n\t"\ | |
1697 "movq %%mm2, %%mm0 \n\t"\ | |
1698 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1699 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1700 "paddw %%mm2, %%mm1 \n\t"\ | |
1701 "paddw %%mm5, %%mm4 \n\t"\ | |
1702 "psraw $2, %%mm1 \n\t"\ | |
1703 "psraw $2, %%mm4 \n\t"\ | |
1704 "packuswb %%mm4, %%mm1 \n\t"\ | |
1705 "movq %%mm1, " #b " \n\t"\ | |
1706 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1707 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1708 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1709 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1710 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1711 DEINT_FF((%0, %1, 4), (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1712 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGd, %1, 4)) |
787 | 1713 |
1714 "movq %%mm0, (%2) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1715 : : "r" (src), "r" ((long)stride), "r"(tmp) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1716 : "%"REG_a, "%"REG_d |
787 | 1717 ); |
1718 #else | |
1719 int x; | |
1720 src+= stride*4; | |
1721 for(x=0; x<8; x++) | |
1722 { | |
1723 int t1= tmp[x]; | |
1724 int t2= src[stride*1]; | |
1725 | |
1157 | 1726 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); |
787 | 1727 t1= src[stride*4]; |
1157 | 1728 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); |
787 | 1729 t2= src[stride*6]; |
1157 | 1730 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); |
787 | 1731 t1= src[stride*8]; |
1157 | 1732 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
787 | 1733 tmp[x]= t1; |
1734 | |
1735 src++; | |
1736 } | |
1737 #endif | |
1738 } | |
1739 | |
1740 /** | |
1157 | 1741 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
1742 * will be called for every 8x8 block and can read & write from line 4-15 | |
1743 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1744 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
1745 * this filter will read lines 4-13 and write 4-11 | |
1746 */ | |
1747 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
1748 { | |
1749 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
1750 src+= stride*4; | |
1751 asm volatile( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1752 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1753 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
1157 | 1754 "pxor %%mm7, %%mm7 \n\t" |
1755 "movq (%2), %%mm0 \n\t" | |
1756 "movq (%3), %%mm1 \n\t" | |
1757 // 0 1 2 3 4 5 6 7 8 9 10 | |
1758 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
1759 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1760 #define REAL_DEINT_L5(t1,t2,a,b,c)\ |
1157 | 1761 "movq " #a ", %%mm2 \n\t"\ |
1762 "movq " #b ", %%mm3 \n\t"\ | |
1763 "movq " #c ", %%mm4 \n\t"\ | |
1764 PAVGB(t2, %%mm3) \ | |
1765 PAVGB(t1, %%mm4) \ | |
1766 "movq %%mm2, %%mm5 \n\t"\ | |
1767 "movq %%mm2, " #t1 " \n\t"\ | |
1768 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
1769 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
1770 "movq %%mm2, %%mm6 \n\t"\ | |
1771 "paddw %%mm2, %%mm2 \n\t"\ | |
1772 "paddw %%mm6, %%mm2 \n\t"\ | |
1773 "movq %%mm5, %%mm6 \n\t"\ | |
1774 "paddw %%mm5, %%mm5 \n\t"\ | |
1775 "paddw %%mm6, %%mm5 \n\t"\ | |
1776 "movq %%mm3, %%mm6 \n\t"\ | |
1777 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
1778 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
1779 "paddw %%mm3, %%mm3 \n\t"\ | |
1780 "paddw %%mm6, %%mm6 \n\t"\ | |
1781 "paddw %%mm3, %%mm2 \n\t"\ | |
1782 "paddw %%mm6, %%mm5 \n\t"\ | |
1783 "movq %%mm4, %%mm6 \n\t"\ | |
1784 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
1785 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
1786 "psubw %%mm4, %%mm2 \n\t"\ | |
1787 "psubw %%mm6, %%mm5 \n\t"\ | |
1788 "psraw $2, %%mm2 \n\t"\ | |
1789 "psraw $2, %%mm5 \n\t"\ | |
1790 "packuswb %%mm5, %%mm2 \n\t"\ | |
1791 "movq %%mm2, " #a " \n\t"\ | |
1792 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1793 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1794 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1795 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1796 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1797 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1798 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1799 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1800 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1801 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1802 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) |
1157 | 1803 |
1804 "movq %%mm0, (%2) \n\t" | |
1805 "movq %%mm1, (%3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1806 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1807 : "%"REG_a, "%"REG_d |
1157 | 1808 ); |
1809 #else | |
1810 int x; | |
1811 src+= stride*4; | |
1812 for(x=0; x<8; x++) | |
1813 { | |
1814 int t1= tmp[x]; | |
1815 int t2= tmp2[x]; | |
1816 int t3= src[0]; | |
1817 | |
1818 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
1819 t1= src[stride*1]; | |
1820 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
1821 t2= src[stride*2]; | |
1822 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
1823 t3= src[stride*3]; | |
1824 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
1825 t1= src[stride*4]; | |
1826 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
1827 t2= src[stride*5]; | |
1828 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
1829 t3= src[stride*6]; | |
1830 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
1831 t1= src[stride*7]; | |
1832 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
1833 | |
1834 tmp[x]= t3; | |
1835 tmp2[x]= t1; | |
1836 | |
1837 src++; | |
1838 } | |
1839 #endif | |
1840 } | |
1841 | |
1842 /** | |
1109 | 1843 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
787 | 1844 * will be called for every 8x8 block and can read & write from line 4-15 |
1845 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
1846 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
142 | 1847 * this filter will read lines 4-13 and write 4-11 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1848 */ |
1581 | 1849 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1850 { |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1851 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
142 | 1852 src+= 4*stride; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1853 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1854 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1855 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1856 // 0 1 2 3 4 5 6 7 8 9 |
787 | 1857 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1858 |
1581 | 1859 "movq (%2), %%mm0 \n\t" // L0 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1860 "movq (%%"REG_a"), %%mm1 \n\t" // L2 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1861 PAVGB(%%mm1, %%mm0) // L0+L2 |
1581 | 1862 "movq (%0), %%mm2 \n\t" // L1 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1863 PAVGB(%%mm2, %%mm0) |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1864 "movq %%mm0, (%0) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1865 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1866 PAVGB(%%mm0, %%mm2) // L1+L3 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1867 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1868 "movq %%mm2, (%%"REG_a") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1869 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1870 PAVGB(%%mm2, %%mm1) // L2+L4 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1871 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1872 "movq %%mm1, (%%"REG_a", %1) \n\t" |
1581 | 1873 "movq (%0, %1, 4), %%mm1 \n\t" // L5 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1874 PAVGB(%%mm1, %%mm0) // L3+L5 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1875 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1876 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1877 "movq (%%"REG_d"), %%mm0 \n\t" // L6 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1878 PAVGB(%%mm0, %%mm2) // L4+L6 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1879 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1880 "movq %%mm2, (%0, %1, 4) \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1881 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1882 PAVGB(%%mm2, %%mm1) // L5+L7 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1883 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1884 "movq %%mm1, (%%"REG_d") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1885 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1886 PAVGB(%%mm1, %%mm0) // L6+L8 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1887 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1888 "movq %%mm0, (%%"REG_d", %1) \n\t" |
1581 | 1889 "movq (%0, %1, 8), %%mm0 \n\t" // L9 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1890 PAVGB(%%mm0, %%mm2) // L7+L9 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1891 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1892 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" |
1581 | 1893 "movq %%mm1, (%2) \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1894 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1895 : : "r" (src), "r" ((long)stride), "r" (tmp) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1896 : "%"REG_a, "%"REG_d |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1897 ); |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1898 #else |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1899 int a, b, c, x; |
142 | 1900 src+= 4*stride; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1901 |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1902 for(x=0; x<2; x++){ |
1581 | 1903 a= *(uint32_t*)&tmp[stride*0]; |
1904 b= *(uint32_t*)&src[stride*0]; | |
1905 c= *(uint32_t*)&src[stride*1]; | |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1906 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1907 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1908 |
1581 | 1909 a= *(uint32_t*)&src[stride*2]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1910 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1911 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1912 |
1581 | 1913 b= *(uint32_t*)&src[stride*3]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1914 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1915 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1916 |
1581 | 1917 c= *(uint32_t*)&src[stride*4]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1918 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1919 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1920 |
1581 | 1921 a= *(uint32_t*)&src[stride*5]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1922 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1923 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1924 |
1581 | 1925 b= *(uint32_t*)&src[stride*6]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1926 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1927 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1928 |
1581 | 1929 c= *(uint32_t*)&src[stride*7]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1930 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1931 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1932 |
1581 | 1933 a= *(uint32_t*)&src[stride*8]; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1934 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1935 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1936 |
1581 | 1937 *(uint32_t*)&tmp[stride*0]= c; |
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1938 src += 4; |
1581 | 1939 tmp += 4; |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1940 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1941 #endif |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1942 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1943 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1944 /** |
1109 | 1945 * Deinterlaces the given block by applying a median filter to every second line. |
142 | 1946 * will be called for every 8x8 block and can read & write from line 4-15, |
1947 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
1948 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1949 */ |
169 | 1950 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1951 { |
107 | 1952 #ifdef HAVE_MMX |
142 | 1953 src+= 4*stride; |
107 | 1954 #ifdef HAVE_MMX2 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1955 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1956 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1957 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1958 // 0 1 2 3 4 5 6 7 8 9 |
787 | 1959 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1960 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1961 "movq (%0), %%mm0 \n\t" // |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1962 "movq (%%"REG_a", %1), %%mm2 \n\t" // |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1963 "movq (%%"REG_a"), %%mm1 \n\t" // |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1964 "movq %%mm0, %%mm3 \n\t" |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1965 "pmaxub %%mm1, %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1966 "pminub %%mm3, %%mm1 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1967 "pmaxub %%mm2, %%mm1 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1968 "pminub %%mm1, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1969 "movq %%mm0, (%%"REG_a") \n\t" |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1970 |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1971 "movq (%0, %1, 4), %%mm0 \n\t" // |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1972 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1973 "movq %%mm2, %%mm3 \n\t" |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1974 "pmaxub %%mm1, %%mm2 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1975 "pminub %%mm3, %%mm1 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1976 "pmaxub %%mm0, %%mm1 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1977 "pminub %%mm1, %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1978 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1979 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1980 "movq (%%"REG_d"), %%mm2 \n\t" // |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1981 "movq (%%"REG_d", %1), %%mm1 \n\t" // |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1982 "movq %%mm2, %%mm3 \n\t" |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1983 "pmaxub %%mm0, %%mm2 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1984 "pminub %%mm3, %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1985 "pmaxub %%mm1, %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1986 "pminub %%mm0, %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1987 "movq %%mm2, (%%"REG_d") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1988 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1989 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1990 "movq (%0, %1, 8), %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1991 "movq %%mm2, %%mm3 \n\t" |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1992 "pmaxub %%mm0, %%mm2 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1993 "pminub %%mm3, %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1994 "pmaxub %%mm1, %%mm0 \n\t" // |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1995 "pminub %%mm0, %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1996 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1997 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1998 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
1999 : : "r" (src), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2000 : "%"REG_a, "%"REG_d |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2001 ); |
107 | 2002 |
2003 #else // MMX without MMX2 | |
2004 asm volatile( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2005 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2006 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
107 | 2007 // 0 1 2 3 4 5 6 7 8 9 |
787 | 2008 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
107 | 2009 "pxor %%mm7, %%mm7 \n\t" |
2010 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2011 #define REAL_MEDIAN(a,b,c)\ |
107 | 2012 "movq " #a ", %%mm0 \n\t"\ |
2013 "movq " #b ", %%mm2 \n\t"\ | |
2014 "movq " #c ", %%mm1 \n\t"\ | |
2015 "movq %%mm0, %%mm3 \n\t"\ | |
2016 "movq %%mm1, %%mm4 \n\t"\ | |
2017 "movq %%mm2, %%mm5 \n\t"\ | |
2018 "psubusb %%mm1, %%mm3 \n\t"\ | |
2019 "psubusb %%mm2, %%mm4 \n\t"\ | |
2020 "psubusb %%mm0, %%mm5 \n\t"\ | |
2021 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
2022 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
2023 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
2024 "movq %%mm3, %%mm6 \n\t"\ | |
2025 "pxor %%mm4, %%mm3 \n\t"\ | |
2026 "pxor %%mm5, %%mm4 \n\t"\ | |
2027 "pxor %%mm6, %%mm5 \n\t"\ | |
2028 "por %%mm3, %%mm1 \n\t"\ | |
2029 "por %%mm4, %%mm2 \n\t"\ | |
2030 "por %%mm5, %%mm0 \n\t"\ | |
2031 "pand %%mm2, %%mm0 \n\t"\ | |
2032 "pand %%mm1, %%mm0 \n\t"\ | |
2033 "movq %%mm0, " #b " \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2034 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2035 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2036 MEDIAN((%0), (%%REGa), (%%REGa, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2037 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2038 MEDIAN((%0, %1, 4), (%%REGd), (%%REGd, %1)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2039 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2040 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2041 : : "r" (src), "r" ((long)stride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2042 : "%"REG_a, "%"REG_d |
107 | 2043 ); |
2044 #endif // MMX | |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2045 #else |
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2046 int x, y; |
142 | 2047 src+= 4*stride; |
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2048 // FIXME - there should be a way to do a few columns in parallel like w/mmx |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2049 for(x=0; x<8; x++) |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2050 { |
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2051 uint8_t *colsrc = src; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2052 for (y=0; y<4; y++) |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2053 { |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2054 int a, b, c, d, e, f; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2055 a = colsrc[0 ]; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2056 b = colsrc[stride ]; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2057 c = colsrc[stride*2]; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2058 d = (a-b)>>31; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2059 e = (b-c)>>31; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2060 f = (c-a)>>31; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2061 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2062 colsrc += stride*2; |
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2063 } |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2064 src++; |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2065 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2066 #endif |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2067 } |
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2068 |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2069 #ifdef HAVE_MMX |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2070 /** |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2071 * transposes and shift the given 8x8 Block into dst1 and dst2 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2072 */ |
169 | 2073 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2074 { |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2075 asm( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2076 "lea (%0, %1), %%"REG_a" \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2077 // 0 1 2 3 4 5 6 7 8 9 |
787 | 2078 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2079 "movq (%0), %%mm0 \n\t" // 12345678 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2080 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2081 "movq %%mm0, %%mm2 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2082 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2083 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2084 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2085 "movq (%%"REG_a", %1), %%mm1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2086 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2087 "movq %%mm1, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2088 "punpcklbw %%mm3, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2089 "punpckhbw %%mm3, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2090 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2091 "movq %%mm0, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2092 "punpcklwd %%mm1, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2093 "punpckhwd %%mm1, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2094 "movq %%mm2, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2095 "punpcklwd %%mm4, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2096 "punpckhwd %%mm4, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2097 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2098 "movd %%mm0, 128(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2099 "psrlq $32, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2100 "movd %%mm0, 144(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2101 "movd %%mm3, 160(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2102 "psrlq $32, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2103 "movd %%mm3, 176(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2104 "movd %%mm3, 48(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2105 "movd %%mm2, 192(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2106 "movd %%mm2, 64(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2107 "psrlq $32, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2108 "movd %%mm2, 80(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2109 "movd %%mm1, 96(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2110 "psrlq $32, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2111 "movd %%mm1, 112(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2112 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2113 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" |
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2114 |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2115 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2116 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2117 "movq %%mm0, %%mm2 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2118 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2119 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2120 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2121 "movq (%%"REG_a", %1), %%mm1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2122 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2123 "movq %%mm1, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2124 "punpcklbw %%mm3, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2125 "punpckhbw %%mm3, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2126 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2127 "movq %%mm0, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2128 "punpcklwd %%mm1, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2129 "punpckhwd %%mm1, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2130 "movq %%mm2, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2131 "punpcklwd %%mm4, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2132 "punpckhwd %%mm4, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2133 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2134 "movd %%mm0, 132(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2135 "psrlq $32, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2136 "movd %%mm0, 148(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2137 "movd %%mm3, 164(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2138 "psrlq $32, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2139 "movd %%mm3, 180(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2140 "movd %%mm3, 52(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2141 "movd %%mm2, 196(%2) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2142 "movd %%mm2, 68(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2143 "psrlq $32, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2144 "movd %%mm2, 84(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2145 "movd %%mm1, 100(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2146 "psrlq $32, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2147 "movd %%mm1, 116(%3) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2148 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2149 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2150 :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2151 : "%"REG_a |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2152 ); |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2153 } |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2154 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2155 /** |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2156 * transposes the given 8x8 block |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2157 */ |
169 | 2158 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2159 { |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2160 asm( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2161 "lea (%0, %1), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2162 "lea (%%"REG_a",%1,4), %%"REG_d"\n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2163 // 0 1 2 3 4 5 6 7 8 9 |
787 | 2164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2165 "movq (%2), %%mm0 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2166 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2167 "movq %%mm0, %%mm2 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2168 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2169 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2170 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2171 "movq 32(%2), %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2172 "movq 48(%2), %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2173 "movq %%mm1, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2174 "punpcklbw %%mm3, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2175 "punpckhbw %%mm3, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2176 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2177 "movq %%mm0, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2178 "punpcklwd %%mm1, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2179 "punpckhwd %%mm1, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2180 "movq %%mm2, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2181 "punpcklwd %%mm4, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2182 "punpckhwd %%mm4, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2183 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2184 "movd %%mm0, (%0) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2185 "psrlq $32, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2186 "movd %%mm0, (%%"REG_a") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2187 "movd %%mm3, (%%"REG_a", %1) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2188 "psrlq $32, %%mm3 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2189 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2190 "movd %%mm2, (%0, %1, 4) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2191 "psrlq $32, %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2192 "movd %%mm2, (%%"REG_d") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2193 "movd %%mm1, (%%"REG_d", %1) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2194 "psrlq $32, %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2195 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2196 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2197 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2198 "movq 64(%2), %%mm0 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2199 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2200 "movq %%mm0, %%mm2 \n\t" // 12345678 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2201 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2202 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2203 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2204 "movq 96(%2), %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2205 "movq 112(%2), %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2206 "movq %%mm1, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2207 "punpcklbw %%mm3, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2208 "punpckhbw %%mm3, %%mm4 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2209 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2210 "movq %%mm0, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2211 "punpcklwd %%mm1, %%mm0 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2212 "punpckhwd %%mm1, %%mm3 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2213 "movq %%mm2, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2214 "punpcklwd %%mm4, %%mm2 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2215 "punpckhwd %%mm4, %%mm1 \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2216 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2217 "movd %%mm0, 4(%0) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2218 "psrlq $32, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2219 "movd %%mm0, 4(%%"REG_a") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2220 "movd %%mm3, 4(%%"REG_a", %1) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2221 "psrlq $32, %%mm3 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2222 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2223 "movd %%mm2, 4(%0, %1, 4) \n\t" |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2224 "psrlq $32, %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2225 "movd %%mm2, 4(%%"REG_d") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2226 "movd %%mm1, 4(%%"REG_d", %1) \n\t" |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2227 "psrlq $32, %%mm1 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2228 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2229 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2230 :: "r" (dst), "r" ((long)dstStride), "r" (src) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2231 : "%"REG_a, "%"REG_d |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2232 ); |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2233 } |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2234 #endif |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2235 //static long test=0; |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2236 |
2041 | 2237 #ifndef HAVE_ALTIVEC |
943 | 2238 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
158 | 2239 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
156 | 2240 { |
787 | 2241 // to save a register (FIXME do this outside of the loops) |
2242 tempBluredPast[127]= maxNoise[0]; | |
2243 tempBluredPast[128]= maxNoise[1]; | |
2244 tempBluredPast[129]= maxNoise[2]; | |
2245 | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 #define FAST_L2_DIFF |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 //#define L1_DIFF //u should change the thresholds too if u try that one |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2249 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2250 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2251 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2252 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2253 // 0 1 2 3 4 5 6 7 8 9 |
787 | 2254 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 //FIXME reorder? |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 #ifdef L1_DIFF //needs mmx2 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2257 "movq (%0), %%mm0 \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2258 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2259 "movq (%0, %2), %%mm1 \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2260 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2261 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2262 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2263 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2264 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2265 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2266 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2267 "paddw %%mm1, %%mm0 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2268 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2269 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2270 "paddw %%mm2, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2271 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2272 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2273 "paddw %%mm3, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2274 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2275 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2276 "paddw %%mm4, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2277 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2278 "paddw %%mm5, %%mm6 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2279 "paddw %%mm7, %%mm6 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2280 "paddw %%mm6, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2281 #else |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2282 #if defined (FAST_L2_DIFF) |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2283 "pcmpeqb %%mm7, %%mm7 \n\t" |
210 | 2284 "movq "MANGLE(b80)", %%mm6 \n\t" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2285 "pxor %%mm0, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2286 #define REAL_L2_DIFF_CORE(a, b)\ |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2287 "movq " #a ", %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2288 "movq " #b ", %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2289 "pxor %%mm7, %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2290 PAVGB(%%mm2, %%mm5)\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2291 "paddb %%mm6, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2292 "movq %%mm5, %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2293 "psllw $8, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2294 "pmaddwd %%mm5, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2295 "pmaddwd %%mm2, %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2296 "paddd %%mm2, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2297 "psrld $14, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2298 "paddd %%mm5, %%mm0 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2300 #else |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2301 "pxor %%mm7, %%mm7 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2302 "pxor %%mm0, %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2303 #define REAL_L2_DIFF_CORE(a, b)\ |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2304 "movq " #a ", %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2305 "movq " #b ", %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2306 "movq %%mm5, %%mm1 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2307 "movq %%mm2, %%mm3 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2308 "punpcklbw %%mm7, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2309 "punpckhbw %%mm7, %%mm1 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2310 "punpcklbw %%mm7, %%mm2 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2311 "punpckhbw %%mm7, %%mm3 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2312 "psubw %%mm2, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2313 "psubw %%mm3, %%mm1 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2314 "pmaddwd %%mm5, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2315 "pmaddwd %%mm1, %%mm1 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2316 "paddd %%mm1, %%mm5 \n\t"\ |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2317 "paddd %%mm5, %%mm0 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2318 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2319 #endif |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2320 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2321 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2322 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2323 L2_DIFF_CORE((%0), (%1)) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2324 L2_DIFF_CORE((%0, %2), (%1, %2)) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2325 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2326 L2_DIFF_CORE((%0, %%REGa), (%1, %%REGa)) |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2327 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2328 L2_DIFF_CORE((%0, %%REGd), (%1, %%REGd)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2329 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2330 L2_DIFF_CORE((%0, %%REGc), (%1, %%REGc)) |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2332 #endif |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2333 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2334 "movq %%mm0, %%mm4 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2335 "psrlq $32, %%mm0 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2336 "paddd %%mm0, %%mm4 \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2337 "movd %%mm4, %%ecx \n\t" |
158 | 2338 "shll $2, %%ecx \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2339 "mov %3, %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2340 "addl -4(%%"REG_d"), %%ecx \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2341 "addl 4(%%"REG_d"), %%ecx \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2342 "addl -1024(%%"REG_d"), %%ecx \n\t" |
158 | 2343 "addl $4, %%ecx \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2344 "addl 1024(%%"REG_d"), %%ecx \n\t" |
158 | 2345 "shrl $3, %%ecx \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2346 "movl %%ecx, (%%"REG_d") \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2347 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2348 // "mov %3, %%"REG_c" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2349 // "mov %%"REG_c", test \n\t" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2350 // "jmp 4f \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2351 "cmpl 512(%%"REG_d"), %%ecx \n\t" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2352 " jb 2f \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2353 "cmpl 516(%%"REG_d"), %%ecx \n\t" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2354 " jb 1f \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2355 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2356 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2357 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2358 "movq (%0), %%mm0 \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2359 "movq (%0, %2), %%mm1 \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2360 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2361 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2362 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2363 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2364 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2365 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2366 "movq %%mm0, (%1) \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2367 "movq %%mm1, (%1, %2) \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2368 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2369 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2370 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2371 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2372 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2373 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2374 "jmp 4f \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2375 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2376 "1: \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2377 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2378 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2379 "movq (%0), %%mm0 \n\t" // L0 |
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2380 PAVGB((%1), %%mm0) // L0 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2381 "movq (%0, %2), %%mm1 \n\t" // L1 |
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2382 PAVGB((%1, %2), %%mm1) // L1 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2383 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2384 PAVGB((%1, %2, 2), %%mm2) // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2385 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2386 PAVGB((%1, %%REGa), %%mm3) // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2387 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2388 PAVGB((%1, %2, 4), %%mm4) // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2389 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2390 PAVGB((%1, %%REGd), %%mm5) // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2391 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2392 PAVGB((%1, %%REGa, 2), %%mm6) // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2393 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2394 PAVGB((%1, %%REGc), %%mm7) // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2395 "movq %%mm0, (%1) \n\t" // R0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2396 "movq %%mm1, (%1, %2) \n\t" // R1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2397 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2398 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2399 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2400 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2401 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2402 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2403 "movq %%mm0, (%0) \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2404 "movq %%mm1, (%0, %2) \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2405 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2406 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2407 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2408 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2409 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2410 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2411 "jmp 4f \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2412 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2413 "2: \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2414 "cmpl 508(%%"REG_d"), %%ecx \n\t" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2415 " jb 3f \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2416 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2417 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2418 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2419 "movq (%0), %%mm0 \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2420 "movq (%0, %2), %%mm1 \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2421 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2422 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2423 "movq (%1), %%mm4 \n\t" // R0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2424 "movq (%1, %2), %%mm5 \n\t" // R1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2425 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2426 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2427 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2428 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2429 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2430 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2431 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2432 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2433 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2434 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2435 "movq %%mm0, (%1) \n\t" // R0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2436 "movq %%mm1, (%1, %2) \n\t" // R1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2437 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2438 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2439 "movq %%mm0, (%0) \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2440 "movq %%mm1, (%0, %2) \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2441 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2442 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2443 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2444 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2445 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2446 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2447 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2448 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2449 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2450 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2451 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2452 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2453 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2454 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2455 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2456 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2457 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2458 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2459 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2460 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2461 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2462 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2463 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2464 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2465 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2466 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2467 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2468 "jmp 4f \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2469 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2470 "3: \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2471 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2472 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2473 "movq (%0), %%mm0 \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2474 "movq (%0, %2), %%mm1 \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2475 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2476 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2477 "movq (%1), %%mm4 \n\t" // R0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2478 "movq (%1, %2), %%mm5 \n\t" // R1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2479 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2480 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2481 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2482 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2483 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2484 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2485 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2486 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2487 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2488 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2489 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2490 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2491 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2492 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2493 "movq %%mm0, (%1) \n\t" // R0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2494 "movq %%mm1, (%1, %2) \n\t" // R1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2495 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2496 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2497 "movq %%mm0, (%0) \n\t" // L0 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2498 "movq %%mm1, (%0, %2) \n\t" // L1 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2499 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2500 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2501 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2502 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2503 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2504 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2505 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2506 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2507 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2508 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2509 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2510 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2511 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2512 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2513 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2514 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2515 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2516 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2517 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2518 PAVGB(%%mm4, %%mm0) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2519 PAVGB(%%mm5, %%mm1) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2520 PAVGB(%%mm6, %%mm2) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2521 PAVGB(%%mm7, %%mm3) |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2522 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2523 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2524 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2525 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2526 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2527 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2528 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2529 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2530 |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2531 "4: \n\t" |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2532 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2533 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2534 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2535 ); |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2536 //printf("%d\n", test); |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2537 #else |
788 | 2538 { |
156 | 2539 int y; |
2540 int d=0; | |
2041 | 2541 // int sysd=0; |
158 | 2542 int i; |
156 | 2543 |
2544 for(y=0; y<8; y++) | |
2545 { | |
2546 int x; | |
2547 for(x=0; x<8; x++) | |
2548 { | |
2549 int ref= tempBlured[ x + y*stride ]; | |
2550 int cur= src[ x + y*stride ]; | |
2551 int d1=ref - cur; | |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2552 // if(x==0 || x==7) d1+= d1>>1; |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2553 // if(y==0 || y==7) d1+= d1>>1; |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2554 // d+= ABS(d1); |
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2555 d+= d1*d1; |
2041 | 2556 // sysd+= d1; |
156 | 2557 } |
2558 } | |
158 | 2559 i=d; |
2560 d= ( | |
2561 4*d | |
2562 +(*(tempBluredPast-256)) | |
2563 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
2564 +(*(tempBluredPast+256)) | |
2565 +4)>>3; | |
2566 *tempBluredPast=i; | |
2567 // ((*tempBluredPast)*3 + d + 2)>>2; | |
2568 | |
156 | 2569 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
2570 /* | |
2571 Switch between | |
2572 1 0 0 0 0 0 0 (0) | |
2573 64 32 16 8 4 2 1 (1) | |
2574 64 48 36 27 20 15 11 (33) (approx) | |
2575 64 56 49 43 37 33 29 (200) (approx) | |
2576 */ | |
2577 if(d > maxNoise[1]) | |
2578 { | |
2579 if(d < maxNoise[2]) | |
2580 { | |
2581 for(y=0; y<8; y++) | |
2582 { | |
2583 int x; | |
2584 for(x=0; x<8; x++) | |
2585 { | |
2586 int ref= tempBlured[ x + y*stride ]; | |
2587 int cur= src[ x + y*stride ]; | |
2588 tempBlured[ x + y*stride ]= | |
2589 src[ x + y*stride ]= | |
2590 (ref + cur + 1)>>1; | |
2591 } | |
2592 } | |
2593 } | |
2594 else | |
2595 { | |
2596 for(y=0; y<8; y++) | |
2597 { | |
2598 int x; | |
2599 for(x=0; x<8; x++) | |
2600 { | |
2601 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
2602 } | |
2603 } | |
2604 } | |
2605 } | |
2606 else | |
2607 { | |
2608 if(d < maxNoise[0]) | |
2609 { | |
2610 for(y=0; y<8; y++) | |
2611 { | |
2612 int x; | |
2613 for(x=0; x<8; x++) | |
2614 { | |
2615 int ref= tempBlured[ x + y*stride ]; | |
2616 int cur= src[ x + y*stride ]; | |
2617 tempBlured[ x + y*stride ]= | |
2618 src[ x + y*stride ]= | |
2619 (ref*7 + cur + 4)>>3; | |
2620 } | |
2621 } | |
2622 } | |
2623 else | |
2624 { | |
2625 for(y=0; y<8; y++) | |
2626 { | |
2627 int x; | |
2628 for(x=0; x<8; x++) | |
2629 { | |
2630 int ref= tempBlured[ x + y*stride ]; | |
2631 int cur= src[ x + y*stride ]; | |
2632 tempBlured[ x + y*stride ]= | |
2633 src[ x + y*stride ]= | |
2634 (ref*3 + cur + 2)>>2; | |
2635 } | |
2636 } | |
2637 } | |
2638 } | |
788 | 2639 } |
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2640 #endif |
156 | 2641 } |
2041 | 2642 #endif //HAVE_ALTIVEC |
156 | 2643 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2644 #ifdef HAVE_MMX |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2645 /** |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2646 * accurate deblock filter |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2647 */ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2648 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2649 int64_t dc_mask, eq_mask; |
2040 | 2650 int64_t sums[10*8*2]; |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2651 src+= step*3; // src points to begin of the 8x8 Block |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2652 //START_TIMER |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2653 asm volatile( |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2654 "movq %0, %%mm7 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2655 "movq %1, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2656 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2657 ); |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2658 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2659 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2660 "lea (%2, %3), %%"REG_a" \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2661 // 0 1 2 3 4 5 6 7 8 9 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2662 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2663 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2664 "movq (%2), %%mm0 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2665 "movq (%%"REG_a"), %%mm1 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2666 "movq %%mm1, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2667 "movq %%mm1, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2668 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2669 "paddb %%mm7, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2670 "pcmpgtb %%mm6, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2671 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2672 "movq (%%"REG_a",%3), %%mm2 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2673 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2674 PMINUB(%%mm2, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2675 "psubb %%mm2, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2676 "paddb %%mm7, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2677 "pcmpgtb %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2678 "paddb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2679 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2680 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2681 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2682 PMINUB(%%mm1, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2683 "psubb %%mm1, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2684 "paddb %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2685 "pcmpgtb %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2686 "paddb %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2687 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2688 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2689 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2690 "movq (%2, %3, 4), %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2691 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2692 PMINUB(%%mm2, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2693 "psubb %%mm2, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2694 "paddb %%mm7, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2695 "pcmpgtb %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2696 "paddb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2697 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2698 "movq (%%"REG_a"), %%mm1 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2699 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2700 PMINUB(%%mm1, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2701 "psubb %%mm1, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2702 "paddb %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2703 "pcmpgtb %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2704 "paddb %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2705 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2706 "movq (%%"REG_a", %3), %%mm2 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2707 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2708 PMINUB(%%mm2, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2709 "psubb %%mm2, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2710 "paddb %%mm7, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2711 "pcmpgtb %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2712 "paddb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2713 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2714 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2715 PMAXUB(%%mm1, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2716 PMINUB(%%mm1, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2717 "psubb %%mm1, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2718 "paddb %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2719 "pcmpgtb %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2720 "paddb %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2721 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2722 "movq (%2, %3, 8), %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2723 PMAXUB(%%mm2, %%mm4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2724 PMINUB(%%mm2, %%mm3, %%mm5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2725 "psubb %%mm2, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2726 "paddb %%mm7, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2727 "pcmpgtb %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2728 "paddb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2729 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2730 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2731 "psubb %%mm1, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2732 "paddb %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2733 "pcmpgtb %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2734 "paddb %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2735 "psubusb %%mm3, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2736 |
2276 | 2737 "pxor %%mm6, %%mm6 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2738 "movq %4, %%mm7 \n\t" // QP,..., QP |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2739 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
2276 | 2740 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 |
2741 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 | |
2742 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2743 "movq %%mm7, %1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2744 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2745 "movq %5, %%mm7 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2746 "punpcklbw %%mm7, %%mm7 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2747 "punpcklbw %%mm7, %%mm7 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2748 "punpcklbw %%mm7, %%mm7 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2749 "psubb %%mm0, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2750 "pcmpgtb %%mm7, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2751 "movq %%mm6, %0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2752 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2753 : "=m" (eq_mask), "=m" (dc_mask) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2754 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2755 : "%"REG_a |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2756 ); |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2757 |
2040 | 2758 if(dc_mask & eq_mask){ |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2759 long offset= -8*step; |
2040 | 2760 int64_t *temp_sums= sums; |
2761 | |
2762 asm volatile( | |
2763 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2764 "pxor %%mm4, %%mm4 \n\t" | |
2765 | |
2766 "movq (%0), %%mm6 \n\t" | |
2767 "movq (%0, %1), %%mm5 \n\t" | |
2768 "movq %%mm5, %%mm1 \n\t" | |
2769 "movq %%mm6, %%mm2 \n\t" | |
2770 "psubusb %%mm6, %%mm5 \n\t" | |
2771 "psubusb %%mm1, %%mm2 \n\t" | |
2772 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2773 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2774 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2775 | |
2776 "pxor %%mm6, %%mm1 \n\t" | |
2777 "pand %%mm0, %%mm1 \n\t" | |
2778 "pxor %%mm1, %%mm6 \n\t" | |
2779 // 0:QP 6:First | |
2780 | |
2781 "movq (%0, %1, 8), %%mm5 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2782 "add %1, %0 \n\t" // %0 points to line 1 not 0 |
2040 | 2783 "movq (%0, %1, 8), %%mm7 \n\t" |
2784 "movq %%mm5, %%mm1 \n\t" | |
2785 "movq %%mm7, %%mm2 \n\t" | |
2786 "psubusb %%mm7, %%mm5 \n\t" | |
2787 "psubusb %%mm1, %%mm2 \n\t" | |
2788 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2789 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2790 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2791 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2792 | |
2793 "pxor %%mm7, %%mm1 \n\t" | |
2794 "pand %%mm0, %%mm1 \n\t" | |
2795 "pxor %%mm1, %%mm7 \n\t" | |
2796 | |
2797 "movq %%mm6, %%mm5 \n\t" | |
2798 "punpckhbw %%mm4, %%mm6 \n\t" | |
2799 "punpcklbw %%mm4, %%mm5 \n\t" | |
2800 // 4:0 5/6:First 7:Last | |
2801 | |
2802 "movq %%mm5, %%mm0 \n\t" | |
2803 "movq %%mm6, %%mm1 \n\t" | |
2804 "psllw $2, %%mm0 \n\t" | |
2805 "psllw $2, %%mm1 \n\t" | |
2806 "paddw "MANGLE(w04)", %%mm0 \n\t" | |
2807 "paddw "MANGLE(w04)", %%mm1 \n\t" | |
2808 | |
2809 #define NEXT\ | |
2810 "movq (%0), %%mm2 \n\t"\ | |
2811 "movq (%0), %%mm3 \n\t"\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2812 "add %1, %0 \n\t"\ |
2040 | 2813 "punpcklbw %%mm4, %%mm2 \n\t"\ |
2814 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2815 "paddw %%mm2, %%mm0 \n\t"\ | |
2816 "paddw %%mm3, %%mm1 \n\t" | |
2817 | |
2818 #define PREV\ | |
2819 "movq (%0), %%mm2 \n\t"\ | |
2820 "movq (%0), %%mm3 \n\t"\ | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2821 "add %1, %0 \n\t"\ |
2040 | 2822 "punpcklbw %%mm4, %%mm2 \n\t"\ |
2823 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2824 "psubw %%mm2, %%mm0 \n\t"\ | |
2825 "psubw %%mm3, %%mm1 \n\t" | |
2826 | |
2827 | |
2828 NEXT //0 | |
2829 NEXT //1 | |
2830 NEXT //2 | |
2831 "movq %%mm0, (%3) \n\t" | |
2832 "movq %%mm1, 8(%3) \n\t" | |
2833 | |
2834 NEXT //3 | |
2835 "psubw %%mm5, %%mm0 \n\t" | |
2836 "psubw %%mm6, %%mm1 \n\t" | |
2837 "movq %%mm0, 16(%3) \n\t" | |
2838 "movq %%mm1, 24(%3) \n\t" | |
2839 | |
2840 NEXT //4 | |
2841 "psubw %%mm5, %%mm0 \n\t" | |
2842 "psubw %%mm6, %%mm1 \n\t" | |
2843 "movq %%mm0, 32(%3) \n\t" | |
2844 "movq %%mm1, 40(%3) \n\t" | |
2845 | |
2846 NEXT //5 | |
2847 "psubw %%mm5, %%mm0 \n\t" | |
2848 "psubw %%mm6, %%mm1 \n\t" | |
2849 "movq %%mm0, 48(%3) \n\t" | |
2850 "movq %%mm1, 56(%3) \n\t" | |
2851 | |
2852 NEXT //6 | |
2853 "psubw %%mm5, %%mm0 \n\t" | |
2854 "psubw %%mm6, %%mm1 \n\t" | |
2855 "movq %%mm0, 64(%3) \n\t" | |
2856 "movq %%mm1, 72(%3) \n\t" | |
2857 | |
2858 "movq %%mm7, %%mm6 \n\t" | |
2859 "punpckhbw %%mm4, %%mm7 \n\t" | |
2860 "punpcklbw %%mm4, %%mm6 \n\t" | |
2861 | |
2862 NEXT //7 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2863 "mov %4, %0 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2864 "add %1, %0 \n\t" |
2040 | 2865 PREV //0 |
2866 "movq %%mm0, 80(%3) \n\t" | |
2867 "movq %%mm1, 88(%3) \n\t" | |
2868 | |
2869 PREV //1 | |
2870 "paddw %%mm6, %%mm0 \n\t" | |
2871 "paddw %%mm7, %%mm1 \n\t" | |
2872 "movq %%mm0, 96(%3) \n\t" | |
2873 "movq %%mm1, 104(%3) \n\t" | |
2874 | |
2875 PREV //2 | |
2876 "paddw %%mm6, %%mm0 \n\t" | |
2877 "paddw %%mm7, %%mm1 \n\t" | |
2878 "movq %%mm0, 112(%3) \n\t" | |
2879 "movq %%mm1, 120(%3) \n\t" | |
2880 | |
2881 PREV //3 | |
2882 "paddw %%mm6, %%mm0 \n\t" | |
2883 "paddw %%mm7, %%mm1 \n\t" | |
2884 "movq %%mm0, 128(%3) \n\t" | |
2885 "movq %%mm1, 136(%3) \n\t" | |
2886 | |
2887 PREV //4 | |
2888 "paddw %%mm6, %%mm0 \n\t" | |
2889 "paddw %%mm7, %%mm1 \n\t" | |
2890 "movq %%mm0, 144(%3) \n\t" | |
2891 "movq %%mm1, 152(%3) \n\t" | |
2892 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2893 "mov %4, %0 \n\t" //FIXME |
2040 | 2894 |
2895 : "+&r"(src) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2896 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) |
2040 | 2897 ); |
2898 | |
2899 src+= step; // src points to begin of the 8x8 Block | |
2900 | |
2901 asm volatile( | |
2902 "movq %4, %%mm6 \n\t" | |
2903 "pcmpeqb %%mm5, %%mm5 \n\t" | |
2904 "pxor %%mm6, %%mm5 \n\t" | |
2905 "pxor %%mm7, %%mm7 \n\t" | |
2906 | |
2907 "1: \n\t" | |
2908 "movq (%1), %%mm0 \n\t" | |
2909 "movq 8(%1), %%mm1 \n\t" | |
2910 "paddw 32(%1), %%mm0 \n\t" | |
2911 "paddw 40(%1), %%mm1 \n\t" | |
2912 "movq (%0, %3), %%mm2 \n\t" | |
2913 "movq %%mm2, %%mm3 \n\t" | |
2914 "movq %%mm2, %%mm4 \n\t" | |
2915 "punpcklbw %%mm7, %%mm2 \n\t" | |
2916 "punpckhbw %%mm7, %%mm3 \n\t" | |
2917 "paddw %%mm2, %%mm0 \n\t" | |
2918 "paddw %%mm3, %%mm1 \n\t" | |
2919 "paddw %%mm2, %%mm0 \n\t" | |
2920 "paddw %%mm3, %%mm1 \n\t" | |
2921 "psrlw $4, %%mm0 \n\t" | |
2922 "psrlw $4, %%mm1 \n\t" | |
2923 "packuswb %%mm1, %%mm0 \n\t" | |
2924 "pand %%mm6, %%mm0 \n\t" | |
2925 "pand %%mm5, %%mm4 \n\t" | |
2926 "por %%mm4, %%mm0 \n\t" | |
2927 "movq %%mm0, (%0, %3) \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2928 "add $16, %1 \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2929 "add %2, %0 \n\t" |
2040 | 2930 " js 1b \n\t" |
2931 | |
2932 : "+r"(offset), "+r"(temp_sums) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2933 : "r" ((long)step), "r"(src - offset), "m"(dc_mask & eq_mask) |
2040 | 2934 ); |
2935 }else | |
2936 src+= step; // src points to begin of the 8x8 Block | |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2937 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2938 if(eq_mask != -1LL){ |
2040 | 2939 uint8_t *temp_src= src; |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2940 asm volatile( |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2941 "pxor %%mm7, %%mm7 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2942 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2943 "and "ALIGN_MASK", %%"REG_c" \n\t" // align |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2944 // 0 1 2 3 4 5 6 7 8 9 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2945 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2946 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2947 "movq (%0), %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2948 "movq %%mm0, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2949 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2950 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2951 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2952 "movq (%0, %1), %%mm2 \n\t" |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2953 "lea (%0, %1, 2), %%"REG_a" \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2954 "movq %%mm2, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2955 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2956 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2957 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2958 "movq (%%"REG_a"), %%mm4 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2959 "movq %%mm4, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2960 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2961 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2962 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2963 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2964 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2965 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2966 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2967 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2968 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2969 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2970 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2971 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2972 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2973 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2974 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2975 "movq (%%"REG_a", %1), %%mm2 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2976 "movq %%mm2, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2977 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2978 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2979 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2980 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2981 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2982 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2983 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2984 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2985 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2986 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2987 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2988 "movq %%mm0, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2989 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2990 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2991 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2992 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2993 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2994 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
2995 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2996 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2997 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2998 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2999 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3000 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3001 "lea (%%"REG_a", %1), %0 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3002 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3003 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3004 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3005 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3006 //50 opcodes so far |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3007 "movq (%0, %1, 2), %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3008 "movq %%mm2, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3009 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3010 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3011 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3012 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3013 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3014 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3015 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3016 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3017 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3018 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3019 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3020 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3021 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3022 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3023 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3024 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3025 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3026 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3027 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3028 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3029 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3030 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3031 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3032 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3033 "movq (%0, %1, 4), %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3034 "movq %%mm2, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3035 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3036 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3037 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3038 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3039 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3040 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3041 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3042 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3043 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3044 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3045 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3046 #ifdef HAVE_MMX2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3047 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3048 "psubw %%mm0, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3049 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3050 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3051 "psubw %%mm1, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3052 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3053 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3054 "psubw %%mm2, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3055 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3056 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3057 "psubw %%mm3, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3058 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3059 #else |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3060 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3061 "pcmpgtw %%mm0, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3062 "pxor %%mm6, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3063 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3064 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3065 "pcmpgtw %%mm1, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3066 "pxor %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3067 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3068 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3069 "pcmpgtw %%mm2, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3070 "pxor %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3071 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3072 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3073 "pcmpgtw %%mm3, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3074 "pxor %%mm6, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3075 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3076 #endif |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3077 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3078 #ifdef HAVE_MMX2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3079 "pminsw %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3080 "pminsw %%mm3, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3081 #else |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3082 "movq %%mm0, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3083 "psubusw %%mm2, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3084 "psubw %%mm6, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3085 "movq %%mm1, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3086 "psubusw %%mm3, %%mm6 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3087 "psubw %%mm6, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3088 #endif |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3089 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3090 "movd %2, %%mm2 \n\t" // QP |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3091 "punpcklbw %%mm7, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3092 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3093 "movq %%mm7, %%mm6 \n\t" // 0 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3094 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3095 "pxor %%mm6, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3096 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3097 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3098 "pxor %%mm7, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3099 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3100 // 100 opcodes |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3101 "psllw $3, %%mm2 \n\t" // 8QP |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3102 "movq %%mm2, %%mm3 \n\t" // 8QP |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3103 "pcmpgtw %%mm4, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3104 "pcmpgtw %%mm5, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3105 "pand %%mm2, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3106 "pand %%mm3, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3107 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3108 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3109 "psubusw %%mm0, %%mm4 \n\t" // hd |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3110 "psubusw %%mm1, %%mm5 \n\t" // ld |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3111 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3112 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3113 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3114 "pmullw %%mm2, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3115 "pmullw %%mm2, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3116 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3117 "paddw %%mm2, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3118 "paddw %%mm2, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3119 "psrlw $6, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3120 "psrlw $6, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3121 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3122 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3123 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3124 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3125 "pxor %%mm2, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3126 "pxor %%mm3, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3127 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3128 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3129 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3130 "pxor %%mm2, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3131 "pxor %%mm3, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3132 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3133 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3134 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3135 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3136 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3137 "pxor %%mm6, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3138 "pxor %%mm7, %%mm3 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3139 "pand %%mm2, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3140 "pand %%mm3, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3141 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3142 #ifdef HAVE_MMX2 |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3143 "pminsw %%mm0, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3144 "pminsw %%mm1, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3145 #else |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3146 "movq %%mm4, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3147 "psubusw %%mm0, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3148 "psubw %%mm2, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3149 "movq %%mm5, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3150 "psubusw %%mm1, %%mm2 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3151 "psubw %%mm2, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3152 #endif |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3153 "pxor %%mm6, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3154 "pxor %%mm7, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3155 "psubw %%mm6, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3156 "psubw %%mm7, %%mm5 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3157 "packsswb %%mm5, %%mm4 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3158 "movq %3, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3159 "pandn %%mm4, %%mm1 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3160 "movq (%0), %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3161 "paddb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3162 "movq %%mm0, (%0) \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3163 "movq (%0, %1), %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3164 "psubb %%mm1, %%mm0 \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3165 "movq %%mm0, (%0, %1) \n\t" |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3166 |
2040 | 3167 : "+r" (temp_src) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3168 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3169 : "%"REG_a, "%"REG_c |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3170 ); |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3171 } |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3172 /*if(step==16){ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3173 STOP_TIMER("step16") |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3174 }else{ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3175 STOP_TIMER("stepX") |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3176 }*/ |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3177 } |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3178 #endif //HAVE_MMX |
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3179 |
169 | 3180 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
787 | 3181 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
96 | 3182 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3183 /** |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3184 * Copies a block from src to dst and fixes the blacklevel |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3185 * levelFix == 0 -> don't touch the brightness & contrast |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3186 */ |
634
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3187 #undef SCALED_CPY |
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3188 |
169 | 3189 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
787 | 3190 int levelFix, int64_t *packedOffsetAndScale) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3191 { |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3192 #ifndef HAVE_MMX |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3193 int i; |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3194 #endif |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3195 if(levelFix) |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3196 { |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3197 #ifdef HAVE_MMX |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3198 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3199 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3200 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3201 "lea (%2,%4), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3202 "lea (%3,%5), %%"REG_d" \n\t" |
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3203 "pxor %%mm4, %%mm4 \n\t" |
173 | 3204 #ifdef HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3205 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
173 | 3206 "movq " #src1 ", %%mm0 \n\t"\ |
3207 "movq " #src1 ", %%mm5 \n\t"\ | |
3208 "movq " #src2 ", %%mm1 \n\t"\ | |
3209 "movq " #src2 ", %%mm6 \n\t"\ | |
3210 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
3211 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
3212 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
3213 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
3214 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
3215 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
3216 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
3217 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
3218 "psubw %%mm2, %%mm0 \n\t"\ | |
3219 "psubw %%mm2, %%mm5 \n\t"\ | |
3220 "psubw %%mm2, %%mm1 \n\t"\ | |
3221 "psubw %%mm2, %%mm6 \n\t"\ | |
3222 "packuswb %%mm5, %%mm0 \n\t"\ | |
3223 "packuswb %%mm6, %%mm1 \n\t"\ | |
3224 "movq %%mm0, " #dst1 " \n\t"\ | |
3225 "movq %%mm1, " #dst2 " \n\t"\ | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3226 |
173 | 3227 #else //HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3228 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
166 | 3229 "movq " #src1 ", %%mm0 \n\t"\ |
3230 "movq " #src1 ", %%mm5 \n\t"\ | |
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3231 "punpcklbw %%mm4, %%mm0 \n\t"\ |
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3232 "punpckhbw %%mm4, %%mm5 \n\t"\ |
117 | 3233 "psubw %%mm2, %%mm0 \n\t"\ |
3234 "psubw %%mm2, %%mm5 \n\t"\ | |
166 | 3235 "movq " #src2 ", %%mm1 \n\t"\ |
117 | 3236 "psllw $6, %%mm0 \n\t"\ |
3237 "psllw $6, %%mm5 \n\t"\ | |
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3238 "pmulhw %%mm3, %%mm0 \n\t"\ |
166 | 3239 "movq " #src2 ", %%mm6 \n\t"\ |
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3240 "pmulhw %%mm3, %%mm5 \n\t"\ |
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3241 "punpcklbw %%mm4, %%mm1 \n\t"\ |
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3242 "punpckhbw %%mm4, %%mm6 \n\t"\ |
117 | 3243 "psubw %%mm2, %%mm1 \n\t"\ |
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3244 "psubw %%mm2, %%mm6 \n\t"\ |
117 | 3245 "psllw $6, %%mm1 \n\t"\ |
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3246 "psllw $6, %%mm6 \n\t"\ |
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3247 "pmulhw %%mm3, %%mm1 \n\t"\ |
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3248 "pmulhw %%mm3, %%mm6 \n\t"\ |
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3249 "packuswb %%mm5, %%mm0 \n\t"\ |
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3250 "packuswb %%mm6, %%mm1 \n\t"\ |
166 | 3251 "movq %%mm0, " #dst1 " \n\t"\ |
3252 "movq %%mm1, " #dst2 " \n\t"\ | |
3253 | |
173 | 3254 #endif //!HAVE_MMX2 |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3255 #define SCALED_CPY(src1, src2, dst1, dst2)\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3256 REAL_SCALED_CPY(src1, src2, dst1, dst2) |
173 | 3257 |
787 | 3258 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3259 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3260 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3261 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3262 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3263 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) |
166 | 3264 |
3265 | |
787 | 3266 : "=&a" (packedOffsetAndScale) |
3267 : "0" (packedOffsetAndScale), | |
3268 "r"(src), | |
166 | 3269 "r"(dst), |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3270 "r" ((long)srcStride), |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3271 "r" ((long)dstStride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3272 : "%"REG_d |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3273 ); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3274 #else |
164 | 3275 for(i=0; i<8; i++) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3276 memcpy( &(dst[dstStride*i]), |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3277 &(src[srcStride*i]), BLOCK_SIZE); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3278 #endif |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3279 } |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3280 else |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3281 { |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3282 #ifdef HAVE_MMX |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3283 asm volatile( |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3284 "lea (%0,%2), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3285 "lea (%1,%3), %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3286 |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3287 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ |
166 | 3288 "movq " #src1 ", %%mm0 \n\t"\ |
3289 "movq " #src2 ", %%mm1 \n\t"\ | |
3290 "movq %%mm0, " #dst1 " \n\t"\ | |
3291 "movq %%mm1, " #dst2 " \n\t"\ | |
3292 | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3293 #define SIMPLE_CPY(src1, src2, dst1, dst2)\ |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3294 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3295 |
166 | 3296 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3297 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3298 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3299 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3300 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3301 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) |
166 | 3302 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3303 : : "r" (src), |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3304 "r" (dst), |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3305 "r" ((long)srcStride), |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3306 "r" ((long)dstStride) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3307 : "%"REG_a, "%"REG_d |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3308 ); |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3309 #else |
164 | 3310 for(i=0; i<8; i++) |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3311 memcpy( &(dst[dstStride*i]), |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3312 &(src[srcStride*i]), BLOCK_SIZE); |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3313 #endif |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3314 } |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3315 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3316 |
224 | 3317 /** |
3318 * Copies the 8 bytes at src 3 times upward, to src - stride, src - 2*stride and src - 3*stride | |
3319 */ | |
3320 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
3321 { | |
3322 #ifdef HAVE_MMX | |
3323 asm volatile( | |
3324 "movq (%0), %%mm0 \n\t" | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3325 "add %1, %0 \n\t" |
224 | 3326 "movq %%mm0, (%0) \n\t" |
3327 "movq %%mm0, (%0, %1) \n\t" | |
3328 "movq %%mm0, (%0, %1, 2) \n\t" | |
3329 : "+r" (src) | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3330 : "r" ((long)-stride) |
224 | 3331 ); |
3332 #else | |
3333 int i; | |
3334 uint8_t *p=src; | |
3335 for(i=0; i<3; i++) | |
3336 { | |
3337 p-= stride; | |
3338 memcpy(p, src, 8); | |
3339 } | |
3340 #endif | |
3341 } | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3342 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3343 /** |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3344 * Filters array of bytes (Y or U or V values) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3345 */ |
169 | 3346 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
787 | 3347 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3348 { |
787 | 3349 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3350 int x,y; |
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3351 #ifdef COMPILE_TIME_MODE |
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3352 const int mode= COMPILE_TIME_MODE; |
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3353 #else |
787 | 3354 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3355 #endif |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3356 int black=0, white=255; // blackest black and whitest white in the picture |
223 | 3357 int QPCorrecture= 256*256; |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3358 |
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3359 int copyAhead; |
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3360 #ifdef HAVE_MMX |
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3361 int i; |
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3362 #endif |
164 | 3363 |
957 | 3364 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
3365 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | |
3366 | |
787 | 3367 //FIXME remove |
3368 uint64_t * const yHistogram= c.yHistogram; | |
2527 | 3369 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; |
3370 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; | |
2031
4225c131a2eb
warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1724
diff
changeset
|
3371 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
3372 |
158 | 3373 #ifdef HAVE_MMX |
1724 | 3374 for(i=0; i<57; i++){ |
791 | 3375 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
3376 int threshold= offset*2 + 1; | |
3377 c.mmxDcOffset[i]= 0x7F - offset; | |
3378 c.mmxDcThreshold[i]= 0x7F - threshold; | |
3379 c.mmxDcOffset[i]*= 0x0101010101010101LL; | |
3380 c.mmxDcThreshold[i]*= 0x0101010101010101LL; | |
3381 } | |
158 | 3382 #endif |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3383 |
164 | 3384 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
787 | 3385 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
1157 | 3386 || (mode & FFMPEG_DEINT_FILTER) |
3387 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; | |
164 | 3388 else if( (mode & V_DEBLOCK) |
3389 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3390 || (mode & MEDIAN_DEINT_FILTER) |
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3391 || (mode & V_A_DEBLOCK)) copyAhead=13; |
164 | 3392 else if(mode & V_X1_FILTER) copyAhead=11; |
787 | 3393 // else if(mode & V_RK1_FILTER) copyAhead=10; |
164 | 3394 else if(mode & DERING) copyAhead=9; |
3395 else copyAhead=8; | |
3396 | |
3397 copyAhead-= 8; | |
3398 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3399 if(!isColor) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3400 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3401 uint64_t sum= 0; |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3402 int i; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3403 uint64_t maxClipped; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3404 uint64_t clipped; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3405 double scale; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3406 |
787 | 3407 c.frameNum++; |
3408 // first frame is fscked so we ignore it | |
3409 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3410 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3411 for(i=0; i<256; i++) |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3412 { |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3413 sum+= yHistogram[i]; |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3414 // printf("%d ", yHistogram[i]); |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3415 } |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3416 // printf("\n\n"); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3417 |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3418 /* we always get a completely black picture first */ |
793 | 3419 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3420 |
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3421 clipped= sum; |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3422 for(black=255; black>0; black--) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3423 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3424 if(clipped < maxClipped) break; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3425 clipped-= yHistogram[black]; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3426 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3427 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3428 clipped= sum; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3429 for(white=0; white<256; white++) |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3430 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3431 if(clipped < maxClipped) break; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3432 clipped-= yHistogram[white]; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3433 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3434 |
787 | 3435 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
173 | 3436 |
3437 #ifdef HAVE_MMX2 | |
787 | 3438 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
3439 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
173 | 3440 #else |
787 | 3441 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
3442 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
173 | 3443 #endif |
3444 | |
787 | 3445 c.packedYOffset|= c.packedYOffset<<32; |
3446 c.packedYOffset|= c.packedYOffset<<16; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3447 |
787 | 3448 c.packedYScale|= c.packedYScale<<32; |
3449 c.packedYScale|= c.packedYScale<<16; | |
223 | 3450 |
3451 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
3452 else QPCorrecture= 256*256; | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3453 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3454 else |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3455 { |
787 | 3456 c.packedYScale= 0x0100010001000100LL; |
3457 c.packedYOffset= 0; | |
223 | 3458 QPCorrecture= 256*256; |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3459 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3460 |
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3461 /* copy & deinterlace first row of blocks */ |
142 | 3462 y=-BLOCK_SIZE; |
3463 { | |
3464 uint8_t *srcBlock= &(src[y*srcStride]); | |
224 | 3465 uint8_t *dstBlock= tempDst + dstStride; |
142 | 3466 |
3467 // From this point on it is guaranteed that we can read and write 16 lines downward | |
3468 // finish 1 block before the next, otherwise we might have a problem | |
3469 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
3470 for(x=0; x<width; x+=BLOCK_SIZE) | |
3471 { | |
3472 | |
3473 #ifdef HAVE_MMX2 | |
3474 /* | |
3475 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
3476 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
3477 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
3478 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
3479 */ | |
3480 | |
3481 asm( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3482 "mov %4, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3483 "shr $2, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3484 "and $6, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3485 "add %5, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3486 "mov %%"REG_a", %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3487 "imul %1, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3488 "imul %3, %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3489 "prefetchnta 32(%%"REG_a", %0) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3490 "prefetcht0 32(%%"REG_d", %2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3491 "add %1, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3492 "add %3, %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3493 "prefetchnta 32(%%"REG_a", %0) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3494 "prefetcht0 32(%%"REG_d", %2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3495 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3496 "m" ((long)x), "m" ((long)copyAhead) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3497 : "%"REG_a, "%"REG_d |
142 | 3498 ); |
3499 | |
3500 #elif defined(HAVE_3DNOW) | |
3501 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
3502 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
3503 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3504 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3505 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
3506 */ | |
3507 #endif | |
3508 | |
224 | 3509 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
787 | 3510 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
224 | 3511 |
3512 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
142 | 3513 |
3514 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
169 | 3515 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
142 | 3516 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
1581 | 3517 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
142 | 3518 else if(mode & MEDIAN_DEINT_FILTER) |
169 | 3519 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
142 | 3520 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
169 | 3521 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
787 | 3522 else if(mode & FFMPEG_DEINT_FILTER) |
3523 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
1157 | 3524 else if(mode & LOWPASS5_DEINT_FILTER) |
3525 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
142 | 3526 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
169 | 3527 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
142 | 3528 */ |
3529 dstBlock+=8; | |
3530 srcBlock+=8; | |
3531 } | |
2527 | 3532 if(width==ABS(dstStride)) |
3533 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); | |
941 | 3534 else |
3535 { | |
943 | 3536 int i; |
941 | 3537 for(i=0; i<copyAhead; i++) |
3538 { | |
3539 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
3540 } | |
3541 } | |
142 | 3542 } |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3543 |
787 | 3544 //printf("\n"); |
111 | 3545 for(y=0; y<height; y+=BLOCK_SIZE) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3546 { |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3547 //1% speedup if these are here instead of the inner loop |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3548 uint8_t *srcBlock= &(src[y*srcStride]); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3549 uint8_t *dstBlock= &(dst[y*dstStride]); |
169 | 3550 #ifdef HAVE_MMX |
787 | 3551 uint8_t *tempBlock1= c.tempBlocks; |
3552 uint8_t *tempBlock2= c.tempBlocks + 8; | |
169 | 3553 #endif |
957 | 3554 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
2527 | 3555 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*ABS(QPStride)]; |
156 | 3556 int QP=0; |
130 | 3557 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3558 if not, then use a temporary buffer */ | |
111 | 3559 if(y+15 >= height) |
3560 { | |
156 | 3561 int i; |
164 | 3562 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
111 | 3563 blockcopy to dst later */ |
2527 | 3564 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
3565 MAX(height-y-copyAhead, 0), srcStride); | |
164 | 3566 |
3567 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ | |
3568 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
2527 | 3569 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), ABS(srcStride)); |
156 | 3570 |
164 | 3571 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
2527 | 3572 linecpy(tempDst, dstBlock - dstStride, MIN(height-y+1, copyAhead+1), dstStride); |
164 | 3573 |
3574 /* duplicate last line of dst to fill the void upto line (copyAhead) */ | |
3575 for(i=height-y+1; i<=copyAhead; i++) | |
2527 | 3576 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), ABS(dstStride)); |
156 | 3577 |
130 | 3578 dstBlock= tempDst + dstStride; |
111 | 3579 srcBlock= tempSrc; |
3580 } | |
787 | 3581 //printf("\n"); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3582 |
112 | 3583 // From this point on it is guaranteed that we can read and write 16 lines downward |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3584 // finish 1 block before the next, otherwise we might have a problem |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3585 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3586 for(x=0; x<width; x+=BLOCK_SIZE) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3587 { |
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3588 const int stride= dstStride; |
169 | 3589 #ifdef HAVE_MMX |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3590 uint8_t *tmpXchg; |
169 | 3591 #endif |
791 | 3592 if(isColor) |
121 | 3593 { |
957 | 3594 QP= QPptr[x>>qpHShift]; |
3595 c.nonBQP= nonBQPptr[x>>qpHShift]; | |
791 | 3596 } |
3597 else | |
3598 { | |
3599 QP= QPptr[x>>4]; | |
223 | 3600 QP= (QP* QPCorrecture + 256*128)>>16; |
791 | 3601 c.nonBQP= nonBQPptr[x>>4]; |
3602 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | |
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3603 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
121 | 3604 } |
787 | 3605 c.QP= QP; |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3606 #ifdef HAVE_MMX |
111 | 3607 asm volatile( |
787 | 3608 "movd %1, %%mm7 \n\t" |
111 | 3609 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
3610 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
3611 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
787 | 3612 "movq %%mm7, %0 \n\t" |
3613 : "=m" (c.pQPb) | |
3614 : "r" (QP) | |
111 | 3615 ); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3616 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3617 |
96 | 3618 |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3619 #ifdef HAVE_MMX2 |
126 | 3620 /* |
3621 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
3622 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
3623 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
3624 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
3625 */ | |
3626 | |
3627 asm( | |
2293
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3628 "mov %4, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3629 "shr $2, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3630 "and $6, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3631 "add %5, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3632 "mov %%"REG_a", %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3633 "imul %1, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3634 "imul %3, %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3635 "prefetchnta 32(%%"REG_a", %0) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3636 "prefetcht0 32(%%"REG_d", %2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3637 "add %1, %%"REG_a" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3638 "add %3, %%"REG_d" \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3639 "prefetchnta 32(%%"REG_a", %0) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3640 "prefetcht0 32(%%"REG_d", %2) \n\t" |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3641 :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3642 "m" ((long)x), "m" ((long)copyAhead) |
15cfba1b97b5
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs <aurel at gnuage dot org>)
michael
parents:
2276
diff
changeset
|
3643 : "%"REG_a, "%"REG_d |
126 | 3644 ); |
3645 | |
96 | 3646 #elif defined(HAVE_3DNOW) |
3647 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
111 | 3648 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
3649 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
3650 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
3651 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
96 | 3652 */ |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3653 #endif |
111 | 3654 |
169 | 3655 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
787 | 3656 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3657 |
111 | 3658 if(mode & LINEAR_IPOL_DEINT_FILTER) |
169 | 3659 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
111 | 3660 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
1581 | 3661 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
111 | 3662 else if(mode & MEDIAN_DEINT_FILTER) |
169 | 3663 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
111 | 3664 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
169 | 3665 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
787 | 3666 else if(mode & FFMPEG_DEINT_FILTER) |
3667 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
1157 | 3668 else if(mode & LOWPASS5_DEINT_FILTER) |
3669 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
111 | 3670 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
169 | 3671 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
3672 */ |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3673 |
111 | 3674 /* only deblock if we have 2 blocks */ |
3675 if(y + 8 < height) | |
3676 { | |
787 | 3677 if(mode & V_X1_FILTER) |
3678 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
115
4514b8e7f0f1
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on now uses the alt filter instead of using no filter)
michael
parents:
113
diff
changeset
|
3679 else if(mode & V_DEBLOCK) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3680 { |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3681 const int t= RENAME(vertClassify)(dstBlock, stride, &c); |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3682 |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3683 if(t==1) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3684 RENAME(doVertLowPass)(dstBlock, stride, &c); |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3685 else if(t==2) |
787 | 3686 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter support which is identical to what is recommended in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3687 }else if(mode & V_A_DEBLOCK){ |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3688 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3689 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3690 } |
130 | 3691 |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3692 #ifdef HAVE_MMX |
169 | 3693 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3694 #endif |
111 | 3695 /* check if we have a previous block to deblock it with dstBlock */ |
112 | 3696 if(x - 8 >= 0) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3697 { |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3698 #ifdef HAVE_MMX |
787 | 3699 if(mode & H_X1_FILTER) |
3700 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3701 else if(mode & H_DEBLOCK) |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3702 { |
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3703 //START_TIMER |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3704 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3705 //STOP_TIMER("dc & minmax") |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3706 if(t==1) |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3707 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3708 else if(t==2) |
787 | 3709 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter support which is identical to what is recommended in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3710 }else if(mode & H_A_DEBLOCK){ |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3711 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3712 } |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3713 |
169 | 3714 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3715 |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3716 #else |
115
4514b8e7f0f1
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on now uses the alt filter instead of using no filter)
michael
parents:
113
diff
changeset
|
3717 if(mode & H_X1_FILTER) |
4514b8e7f0f1
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on now uses the alt filter instead of using no filter)
michael
parents:
113
diff
changeset
|
3718 horizX1Filter(dstBlock-4, stride, QP); |
4514b8e7f0f1
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on now uses the alt filter instead of using no filter)
michael
parents:
113
diff
changeset
|
3719 else if(mode & H_DEBLOCK) |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3720 { |
2043 | 3721 #ifdef HAVE_ALTIVEC |
3722 unsigned char __attribute__ ((aligned(16))) tempBlock[272]; | |
3723 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); | |
3724 | |
3725 const int t=vertClassify_altivec(tempBlock-48, 16, &c); | |
3726 if(t==1) { | |
3727 doVertLowPass_altivec(tempBlock-48, 16, &c); | |
3728 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | |
3729 } | |
3730 else if(t==2) { | |
3731 doVertDefFilter_altivec(tempBlock-48, 16, &c); | |
3732 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | |
3733 } | |
3734 #else | |
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3735 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3736 |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3737 if(t==1) |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3738 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3739 else if(t==2) |
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3740 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); |
2043 | 3741 #endif |
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter support which is identical to what is recommended in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3742 }else if(mode & H_A_DEBLOCK){ |
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3743 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3744 } |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3745 #endif |
130 | 3746 if(mode & DERING) |
3747 { | |
3748 //FIXME filter first line | |
787 | 3749 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
130 | 3750 } |
156 | 3751 |
3752 if(mode & TEMP_NOISE_FILTER) | |
3753 { | |
169 | 3754 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
787 | 3755 c.tempBlured[isColor] + y*dstStride + x, |
3756 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3757 c.ppMode.maxTmpNoise); | |
156 | 3758 } |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3759 } |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3760 |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3761 dstBlock+=8; |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3762 srcBlock+=8; |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3763 |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3764 #ifdef HAVE_MMX |
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3765 tmpXchg= tempBlock1; |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3766 tempBlock1= tempBlock2; |
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3767 tempBlock2 = tmpXchg; |
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3768 #endif |
111 | 3769 } |
3770 | |
156 | 3771 if(mode & DERING) |
3772 { | |
787 | 3773 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
156 | 3774 } |
3775 | |
3776 if((mode & TEMP_NOISE_FILTER)) | |
3777 { | |
169 | 3778 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
787 | 3779 c.tempBlured[isColor] + y*dstStride + x, |
3780 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
3781 c.ppMode.maxTmpNoise); | |
156 | 3782 } |
3783 | |
142 | 3784 /* did we use a tmp buffer for the last lines*/ |
112 | 3785 if(y+15 >= height) |
111 | 3786 { |
3787 uint8_t *dstBlock= &(dst[y*dstStride]); | |
2527 | 3788 if(width==ABS(dstStride)) |
3789 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); | |
941 | 3790 else |
3791 { | |
944 | 3792 int i; |
941 | 3793 for(i=0; i<height-y; i++) |
3794 { | |
3795 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
3796 } | |
3797 } | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3798 } |
163 | 3799 /* |
3800 for(x=0; x<width; x+=32) | |
3801 { | |
164 | 3802 volatile int i; |
163 | 3803 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
3804 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
164 | 3805 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
3806 // + dstBlock[x +13*dstStride] | |
3807 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
3808 }*/ | |
3809 } | |
96 | 3810 #ifdef HAVE_3DNOW |
3811 asm volatile("femms"); | |
3812 #elif defined (HAVE_MMX) | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3813 asm volatile("emms"); |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3814 #endif |
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3815 |
163 | 3816 #ifdef DEBUG_BRIGHTNESS |
3817 if(!isColor) | |
3818 { | |
3819 int max=1; | |
3820 int i; | |
3821 for(i=0; i<256; i++) | |
3822 if(yHistogram[i] > max) max=yHistogram[i]; | |
3823 | |
3824 for(i=1; i<256; i++) | |
3825 { | |
3826 int x; | |
3827 int start=yHistogram[i-1]/(max/256+1); | |
3828 int end=yHistogram[i]/(max/256+1); | |
3829 int inc= end > start ? 1 : -1; | |
3830 for(x=start; x!=end+inc; x+=inc) | |
3831 dst[ i*dstStride + x]+=128; | |
3832 } | |
3833 | |
3834 for(i=0; i<100; i+=2) | |
3835 { | |
3836 dst[ (white)*dstStride + i]+=128; | |
3837 dst[ (black)*dstStride + i]+=128; | |
3838 } | |
3839 | |
3840 } | |
3841 #endif | |
3842 | |
787 | 3843 *c2= c; //copy local context back |
3844 | |
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3845 } |