annotate libmpcodecs/vf_fspp.c @ 30936:50b51e6987bd

Replace some "m" constraints by MANGLE to avoid issues with some compilers not being able to compile it and deduplicate the code at the same time.
author reimar
date Wed, 31 Mar 2010 17:00:33 +0000
parents a972c1a4a012
children f957f330aa6d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1 /*
26727
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
2 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
3 * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
4 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
5 * This file is part of MPlayer.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
6 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
7 * MPlayer is free software; you can redistribute it and/or modify
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
8 * it under the terms of the GNU General Public License as published by
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
9 * the Free Software Foundation; either version 2 of the License, or
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
10 * (at your option) any later version.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
11 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
12 * MPlayer is distributed in the hope that it will be useful,
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
15 * GNU General Public License for more details.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
16 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
17 * You should have received a copy of the GNU General Public License along
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
18 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
20 */
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
21
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
22 /*
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
23 * This implementation is based on an algorithm described in
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
24 * "Aria Nosratinia Embedded Post-Processing for
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
25 * Enhancement of Compressed Images (1999)"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
26 * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
27 * Further, with splitting (i)dct into hor/ver passes, one of them can be
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
28 * performed once per block, not pixel. This allows for much better speed.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
29 */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
30
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
31 /*
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
32 Heavily optimized version of SPP filter by Nikolaj
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
33 */
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
34
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
35 #include <stdio.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
36 #include <stdlib.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
37 #include <string.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
38 #include <inttypes.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
39 #include <math.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
40
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
41 #include "config.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
42
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
43 #include "mp_msg.h"
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
44 #include "cpudetect.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
45 #include "img_format.h"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
46 #include "mp_image.h"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
47 #include "vf.h"
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
48 #include "libvo/fastmemcpy.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
49
28588
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
50 #include "libavutil/internal.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
51 #include "libavutil/intreadwrite.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
52 #include "libavutil/mem.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
53 #include "libavcodec/avcodec.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
54 #include "libavcodec/dsputil.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
55
28327
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
56 #undef free
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
57 #undef malloc
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
58
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
59 //===========================================================================//
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
60 #define BLOCKSZ 12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
61
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
62 static const short custom_threshold[64]=
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
63 // values (296) can't be too high
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
64 // -it causes too big quant dependence
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
65 // or maybe overflow(check), which results in some flashing
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
66 { 71, 296, 295, 237, 71, 40, 38, 19,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
67 245, 193, 185, 121, 102, 73, 53, 27,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
68 158, 129, 141, 107, 97, 73, 50, 26,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
69 102, 116, 109, 98, 82, 66, 45, 23,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
70 71, 94, 95, 81, 70, 56, 38, 20,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
71 56, 77, 74, 66, 56, 44, 30, 15,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
72 38, 53, 50, 45, 38, 30, 21, 11,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
73 20, 27, 26, 23, 20, 15, 11, 5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
74 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
75
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
76 static const uint8_t __attribute__((aligned(32))) dither[8][8]={
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
77 { 0, 48, 12, 60, 3, 51, 15, 63, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
78 { 32, 16, 44, 28, 35, 19, 47, 31, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
79 { 8, 56, 4, 52, 11, 59, 7, 55, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
80 { 40, 24, 36, 20, 43, 27, 39, 23, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
81 { 2, 50, 14, 62, 1, 49, 13, 61, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
82 { 34, 18, 46, 30, 33, 17, 45, 29, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
83 { 10, 58, 6, 54, 9, 57, 5, 53, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
84 { 42, 26, 38, 22, 41, 25, 37, 21, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
85 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
86
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
87 struct vf_priv_s { //align 16 !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
88 uint64_t threshold_mtx_noq[8*2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
89 uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
90
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
91 int log2_count;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
92 int temp_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
93 int qp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
94 int mpeg2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
95 int prev_q;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
96 uint8_t *src;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
97 int16_t *temp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
98 int bframes;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
99 char *non_b_qp;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
100 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
101
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
102
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
103 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
104
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
105 //This function reads from one slice (slice 1) and clears slices 0 and 1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
106 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
107 {int y, x;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
108 #define STORE(pos) \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
109 temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
110 src[x + pos]=src[x + pos - 8*src_stride]=0; \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
111 if(temp & 0x100) temp= ~(temp>>31); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
112 dst[x + pos]= temp;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
113
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
114 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
115 const uint8_t *d= dither[y];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
116 for(x=0; x<width; x+=8){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
117 int temp;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
118 STORE(0);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
119 STORE(1);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
120 STORE(2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
121 STORE(3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
122 STORE(4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
123 STORE(5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
124 STORE(6);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
125 STORE(7);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
126 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
127 src+=src_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
128 dst+=dst_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
129 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
130 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
131
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
132 //This function reads from two slices (0 and 2) and clears slice 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
133 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
134 {int y, x;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
135 #define STORE2(pos) \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
136 temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
137 src[x + pos + 16*src_stride]=0; \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
138 if(temp & 0x100) temp= ~(temp>>31); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
139 dst[x + pos]= temp;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
140
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
141 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
142 const uint8_t *d= dither[y];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
143 for(x=0; x<width; x+=8){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
144 int temp;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
145 STORE2(0);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
146 STORE2(1);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
147 STORE2(2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
148 STORE2(3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
149 STORE2(4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
150 STORE2(5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
151 STORE2(6);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
152 STORE2(7);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
153 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
154 src+=src_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
155 dst+=dst_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
156 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
157 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
158
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
159 static void mul_thrmat_c(struct vf_priv_s *p,int q)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
160 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
161 int a;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
162 for(a=0;a<64;a++)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
163 ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
164 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
165
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
166 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
167 static void row_idct_c(DCTELEM* workspace,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
168 int16_t* output_adr, int output_stride, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
169 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
170
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
171 //this is rather ugly, but there is no need for function pointers
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
172 #define store_slice_s store_slice_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
173 #define store_slice2_s store_slice2_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
174 #define mul_thrmat_s mul_thrmat_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
175 #define column_fidct_s column_fidct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
176 #define row_idct_s row_idct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
177 #define row_fdct_s row_fdct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
178
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
179 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
180
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
181 //This function reads from one slice (slice 1) and clears slices 0 and 1
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
182 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
183 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
184 const uint8_t *od=&dither[0][0];
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
185 const uint8_t *end=&dither[height][0];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
186 width = (width+7)&~7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
187 dst_stride-=width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
188 //src_stride=(src_stride-width)*2;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
189 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
190 "mov %5, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
191 "mov %6, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
192 "mov %7, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
193 "mov %1, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
194 "movd %%"REG_d", %%mm5 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
195 "xor $-1, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
196 "mov %%"REG_a", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
197 "add $7, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
198 "neg %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
199 "sub %0, %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
200 "add %%"REG_c", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
201 "movd %%"REG_d", %%mm2 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
202 "mov %%"REG_c", %1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
203 "mov %2, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
204 "shl $4, %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
205
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
206 "2: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
207 "movq (%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
208 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
209 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
210 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
211 "punpckhbw %%mm7, %%mm4 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
212 "mov %0, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
213 "psraw %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
214 "psraw %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
215 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
216 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
217 "movq (%%"REG_S"), %%mm0 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
218 "movq 8(%%"REG_S"), %%mm1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
219
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
220 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
221 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
222 "paddw %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
223
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
224 "movq %%mm7, (%%"REG_S") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
225 "psraw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
226 "psraw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
227
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
228 "movq %%mm7, 8(%%"REG_S") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
229 "packuswb %%mm1, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
230 "add $16, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
231
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
232 "movq %%mm0, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
233 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
234 "sub $8, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
235 "jg 1b \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
236 "add %1, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
237 "add $8, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
238 "add %3, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
239 "cmp %4, %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
240 "jl 2b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
241
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
242 :
29310
c35891e664af replace "g" asm constraint by "erm" since "g" allows 64bit immediates while
gpoirier
parents: 29263
diff changeset
243 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
244 "m" (log2_scale), "m" (src), "m" (dst) //input
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
245 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
246 );
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
247 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
248
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
249 //This func reads from 2 slices, 0 & 2 and clears 2-nd
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
250 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
251 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
252 const uint8_t *od=&dither[0][0];
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
253 const uint8_t *end=&dither[height][0];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
254 width = (width+7)&~7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
255 dst_stride-=width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
256 //src_stride=(src_stride-width)*2;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
257 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
258 "mov %5, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
259 "mov %6, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
260 "mov %7, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
261 "mov %1, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
262 "movd %%"REG_d", %%mm5 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
263 "xor $-1, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
264 "mov %%"REG_a", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
265 "add $7, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
266 "sub %0, %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
267 "add %%"REG_c", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
268 "movd %%"REG_d", %%mm2 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
269 "mov %%"REG_c", %1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
270 "mov %2, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
271 "shl $5, %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
272
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
273 "2: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
274 "movq (%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
275 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
276 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
277 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
278 "punpckhbw %%mm7, %%mm4 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
279 "mov %0, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
280 "psraw %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
281 "psraw %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
282 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
283 "movq (%%"REG_S"), %%mm0 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
284 "movq 8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
285 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
286
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
287 "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
288 "paddw %%mm4, %%mm1 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
289 "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
290
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
291 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
292 "psraw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
293 "paddw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
294
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
295 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
296 "psraw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
297 "packuswb %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
298
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
299 "movq %%mm0, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
300 "add $16, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
301 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
302 "sub $8, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
303 "jg 1b \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
304 "add %1, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
305 "add $8, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
306 "add %3, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
307 "cmp %4, %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
308 "jl 2b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
309
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
310 :
29310
c35891e664af replace "g" asm constraint by "erm" since "g" allows 64bit immediates while
gpoirier
parents: 29263
diff changeset
311 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
312 "m" (log2_scale), "m" (src), "m" (dst) //input
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
313 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
314 );
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
315 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
316
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
317 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
318 {
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
319 uint64_t *adr=&p->threshold_mtx_noq[0];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
320 __asm__ volatile(
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
321 "movd %0, %%mm7 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
322 "add $8*8*2, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
323 "movq 0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
324 "punpcklwd %%mm7, %%mm7 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
325 "movq 1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
326 "punpckldq %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
327 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
328
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
329 "movq 2*8(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
330 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
331
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
332 "movq 3*8(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
333 "pmullw %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
334
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
335 "movq %%mm0, 0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
336 "movq 4*8(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
337 "pmullw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
338
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
339 "movq %%mm1, 1*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
340 "movq 5*8(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
341 "pmullw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
342
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
343 "movq %%mm2, 2*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
344 "movq 6*8(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
345 "pmullw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
346
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
347 "movq %%mm3, 3*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
348 "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
349 "pmullw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
350
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
351 "movq %%mm4, 4*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
352 "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
353 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
354
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
355 "movq %%mm5, 5*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
356 "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
357 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
358
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
359 "movq %%mm6, 6*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
360 "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
361 "pmullw %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
362
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
363 "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
364 "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
365 "pmullw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
366
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
367 "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
368 "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
369 "pmullw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
370
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
371 "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
372 "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
373 "pmullw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
374
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
375 "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
376 "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
377 "pmullw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
378
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
379 "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
380 "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
381 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
382
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
383 "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
384 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
385
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
386 "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
387 "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
388 "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
389
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
390 : "+g" (q), "+S" (adr), "+D" (adr)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
391 :
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
392 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
393 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
394
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
395 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
396 static void row_idct_mmx(DCTELEM* workspace,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
397 int16_t* output_adr, int output_stride, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
398 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
399
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
400 #define store_slice_s store_slice_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
401 #define store_slice2_s store_slice2_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
402 #define mul_thrmat_s mul_thrmat_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
403 #define column_fidct_s column_fidct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
404 #define row_idct_s row_idct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
405 #define row_fdct_s row_fdct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
406 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
407
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
408 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
409 int dst_stride, int src_stride,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
410 int width, int height,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
411 uint8_t *qp_store, int qp_stride, int is_luma)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
412 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
413 int x, x0, y, es, qy, t;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
414 const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
415 const int step=6-p->log2_count;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
416 const int qps= 3 + is_luma;
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
417 int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
418 DCTELEM *block= (DCTELEM *)block_align;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
419 DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
420
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
421 memset(block3, 0, 4*8*BLOCKSZ);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
422
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
423 //p->src=src-src_stride*8-8;//!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
424 if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
425 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
426 int index= 8 + 8*stride + y*stride;
23457
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
427 fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
428 for(x=0; x<8; x++){
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
429 p->src[index - x - 1]= p->src[index + x ];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
430 p->src[index + width + x ]= p->src[index + width - x - 1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
431 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
432 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
433 for(y=0; y<8; y++){
23457
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
434 fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
435 fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
436 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
437 //FIXME (try edge emu)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
438
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
439 for(y=8; y<24; y++)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
440 memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
441
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
442 for(y=step; y<height+8; y+=step){ //step= 1,2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
443 qy=y-4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
444 if (qy>height-1) qy=height-1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
445 if (qy<0) qy=0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
446 qy=(qy>>qps)*qp_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
447 row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
448 for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
449 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
450 if(p->qp)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
451 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
452 else
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
453 for (x=0; x<8*(BLOCKSZ-1); x+=8) {
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
454 t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
455 if (t<0) t=0;//t always < width-2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
456 t=qp_store[qy+(t>>qps)];
30412
41fb4acf3df6 Support more qscale types in most post-processing filters.
reimar
parents: 30363
diff changeset
457 t=norm_qscale(t, p->mpeg2);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
458 if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
459 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
460 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
461 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
25568
707b810a2558 fix artifacts in -vf fspp. regression in r23476.
lorenm
parents: 25221
diff changeset
462 memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling
707b810a2558 fix artifacts in -vf fspp. regression in r23476.
lorenm
parents: 25221
diff changeset
463 memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
464 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
465 //
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
466 es=width+8-x0; // 8, ...
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
467 if (es>8)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
468 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
21578
9345eb2d8c8f count needs to be even
henry
parents: 20585
diff changeset
469 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
470 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
471 {const int y1=y-8+step;//l5-7 l4-6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
472 if (!(y1&7) && y1) {
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
473 if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
474 dst_stride, stride, width, 8, 5-p->log2_count);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
475 else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
476 dst_stride, stride, width, 8, 5-p->log2_count);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
477 } }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
478 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
479
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
480 if (y&7) { // == height & 7
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
481 if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
482 dst_stride, stride, width, y&7, 5-p->log2_count);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
483 else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
484 dst_stride, stride, width, y&7, 5-p->log2_count);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
485 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
486 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
487
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
488 static int config(struct vf_instance *vf,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
489 int width, int height, int d_width, int d_height,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
490 unsigned int flags, unsigned int outfmt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
491 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
492 int h= (height+16+15)&(~15);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
493
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
494 vf->priv->temp_stride= (width+16+15)&(~15);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
495 vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
496 //this can also be avoided, see above
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
497 vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
498
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
499 return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
500 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
501
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
502 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
503 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
504 if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
505 // ok, we can do pp in-place (or pp disabled):
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
506 vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
16018
bdf1b4ecb906 use stored dimensions instead of visible one when (vf_)get_image is called
iive
parents: 15651
diff changeset
507 mpi->type, mpi->flags, mpi->width, mpi->height);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
508 mpi->planes[0]=vf->dmpi->planes[0];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
509 mpi->stride[0]=vf->dmpi->stride[0];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
510 mpi->width=vf->dmpi->width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
511 if(mpi->flags&MP_IMGFLAG_PLANAR){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
512 mpi->planes[1]=vf->dmpi->planes[1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
513 mpi->planes[2]=vf->dmpi->planes[2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
514 mpi->stride[1]=vf->dmpi->stride[1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
515 mpi->stride[2]=vf->dmpi->stride[2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
516 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
517 mpi->flags|=MP_IMGFLAG_DIRECT;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
518 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
519
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
520 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
521 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
522 mp_image_t *dmpi;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
523 if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
524 // no DR, so get a new image! hope we'll get DR buffer:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
525 dmpi=vf_get_image(vf->next,mpi->imgfmt,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
526 MP_IMGTYPE_TEMP,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
527 MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
16018
bdf1b4ecb906 use stored dimensions instead of visible one when (vf_)get_image is called
iive
parents: 15651
diff changeset
528 mpi->width,mpi->height);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
529 vf_clone_mpi_attributes(dmpi, mpi);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
530 }else{
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
531 dmpi=vf->dmpi;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
532 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
533
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
534 vf->priv->mpeg2= mpi->qscale_type;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
535 if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
30363
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
536 int w = mpi->qstride;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
537 int h = (mpi->h + 15) >> 4;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
538 if (!w) {
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
539 w = (mpi->w + 15) >> 4;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
540 h = 1;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
541 }
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
542 if(!vf->priv->non_b_qp)
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
543 vf->priv->non_b_qp= malloc(w*h);
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
544 fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
545 }
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
546 if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
547 char *qp_tab= vf->priv->non_b_qp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
548 if(vf->priv->bframes || !qp_tab)
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
549 qp_tab= mpi->qscale;
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
550
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
551 if(qp_tab || vf->priv->qp){
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
552 filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
553 mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
554 filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
555 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
556 filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
557 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
558 }else{
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
559 memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
560 memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
561 memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
562 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
563 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
564
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
565 #if HAVE_MMX
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
566 if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
567 #endif
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
568 #if HAVE_MMX2
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
569 if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
570 #endif
17906
20aca9baf5d8 passing pts through the filter layer (lets see if pts or cola comes out at the end)
michael
parents: 17523
diff changeset
571 return vf_next_put_image(vf,dmpi, pts);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
572 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
573
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
574 static void uninit(struct vf_instance *vf)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
575 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
576 if(!vf->priv) return;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
577
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
578 if(vf->priv->temp) av_free(vf->priv->temp);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
579 vf->priv->temp= NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
580 if(vf->priv->src) av_free(vf->priv->src);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
581 vf->priv->src= NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
582 //if(vf->priv->avctx) free(vf->priv->avctx);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
583 //vf->priv->avctx= NULL;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
584 if(vf->priv->non_b_qp) free(vf->priv->non_b_qp);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
585 vf->priv->non_b_qp= NULL;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
586
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
587 av_free(vf->priv);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
588 vf->priv=NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
589 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
590
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
591 //===========================================================================//
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
592
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
593 static int query_format(struct vf_instance *vf, unsigned int fmt)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
594 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
595 switch(fmt){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
596 case IMGFMT_YVU9:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
597 case IMGFMT_IF09:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
598 case IMGFMT_YV12:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
599 case IMGFMT_I420:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
600 case IMGFMT_IYUV:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
601 case IMGFMT_CLPL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
602 case IMGFMT_Y800:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
603 case IMGFMT_Y8:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
604 case IMGFMT_444P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
605 case IMGFMT_422P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
606 case IMGFMT_411P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
607 return vf_next_query_format(vf,fmt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
608 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
609 return 0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
610 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
611
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
612 static int control(struct vf_instance *vf, int request, void* data)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
613 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
614 switch(request){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
615 case VFCTRL_QUERY_MAX_PP_LEVEL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
616 return 5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
617 case VFCTRL_SET_PP_LEVEL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
618 vf->priv->log2_count= *((unsigned int*)data);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
619 if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
620 return CONTROL_TRUE;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
621 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
622 return vf_next_control(vf,request,data);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
623 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
624
30638
a7b908875c14 Rename open() vf initialization function to vf_open().
diego
parents: 30412
diff changeset
625 static int vf_open(vf_instance_t *vf, char *args)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
626 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
627 int i=0, bias;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
628 int custom_threshold_m[64];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
629 int log2c=-1;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
630
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
631 vf->config=config;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
632 vf->put_image=put_image;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
633 vf->get_image=get_image;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
634 vf->query_format=query_format;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
635 vf->uninit=uninit;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
636 vf->control= control;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
637 vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
638
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
639 avcodec_init();
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
640
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
641 //vf->priv->avctx= avcodec_alloc_context();
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
642 //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
643
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
644 vf->priv->log2_count= 4;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
645 vf->priv->bframes = 0;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
646
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
647 if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
648
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
649 if( log2c >=4 && log2c <=5 )
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
650 vf->priv->log2_count = log2c;
15651
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
651 else if( log2c >= 6 )
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
652 vf->priv->log2_count = 5;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
653
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
654 if(vf->priv->qp < 0)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
655 vf->priv->qp = 0;
15651
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
656
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
657 if (i < -15) i = -15;
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
658 if (i > 32) i = 32;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
659
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
660 bias= (1<<4)+i; //regulable
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
661 vf->priv->prev_q=0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
662 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
663 for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
664 custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
665 for(i=0;i<8;i++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
666 vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
667 |(((uint64_t)custom_threshold_m[i*8+6])<<16)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
668 |(((uint64_t)custom_threshold_m[i*8+0])<<32)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
669 |(((uint64_t)custom_threshold_m[i*8+4])<<48);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
670 vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
671 |(((uint64_t)custom_threshold_m[i*8+3])<<16)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
672 |(((uint64_t)custom_threshold_m[i*8+1])<<32)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
673 |(((uint64_t)custom_threshold_m[i*8+7])<<48);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
674 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
675
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
676 if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
677
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
678 return 1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
679 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
680
25221
00fff9a3b735 Make all vf_info_t structs const
reimar
parents: 24976
diff changeset
681 const vf_info_t vf_info_fspp = {
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
682 "fast simple postprocess",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
683 "fspp",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
684 "Michael Niedermayer, Nikolaj Poroshin",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
685 "",
30638
a7b908875c14 Rename open() vf initialization function to vf_open().
diego
parents: 30412
diff changeset
686 vf_open,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
687 NULL
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
688 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
689
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
690 //====================================================================
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
691 //Specific spp's dct, idct and threshold functions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
692 //I'd prefer to have them in the separate file.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
693
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
694 //#define MANGLE(a) #a
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
695
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
696 //typedef int16_t DCTELEM; //! only int16_t
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
697
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
#define DCTSIZE 8
#define DCTSIZE_S "8"

/* x scaled by 2^s and rounded, reduced to its low 16 bits (so negative
 * values come out as their 16-bit two's-complement pattern). */
#define FIX(x,s)  ((int) ((x) * (1<<(s)) + 0.5)&0xffff)
/* Replicate x into all four 16-bit lanes of a 64-bit word; x is expected to
 * fit in 16 bits (0..0xffff). Every shift is done in uint64_t so nothing is
 * shifted into the sign bit of an int, and the expansion is fully
 * parenthesized so it can be used inside larger expressions. */
#define C64(x)    (((uint64_t)(x)<<48) | ((uint64_t)(x)<<32) | ((uint64_t)(x)<<16) | (uint64_t)(x))
#define FIX64(x,s)  C64(FIX(x,s))

/* High part of a product: (x*k) shifted right by 16. */
#define MULTIPLY16H(x,k)   (((x)*(k))>>16)
/* r = x when x lies outside the band -t..t, otherwise r = 0. Kept as a bare
 * if/else statement (with its own trailing semicolon) so existing uses keep
 * compiling unchanged. */
#define THRESHOLD(r,x,t) if(((unsigned)((x)+(t)))>(t)*2) r=(x);else r=0;
/* Right shift by n with rounding. */
#define DESCALE(x,n)  (((x) + (1 << ((n)-1))) >> (n))
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
708
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
709 #if HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
710
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
711 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
712 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
714 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
715
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
716 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
717
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
718 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
719 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
722 //for t3,t5,t7 == 0 shortcut
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
723 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
724 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
726
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
727 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
728 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
729
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
730 #else /* !HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
731
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
732 typedef int32_t int_simd16_t;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
733 static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
734 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
735 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
736 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
737 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
738 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
25902
15ab840747e2 mark constants as such
reimar
parents: 25901
diff changeset
739 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
740 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
25902
15ab840747e2 mark constants as such
reimar
parents: 25901
diff changeset
741 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
742
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
743 #endif
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
744
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
745 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
746
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
747 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
748 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
749 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
750 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
751 int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
752 int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
753
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
754 DCTELEM* dataptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
755 DCTELEM* wsptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
756 int16_t *threshold;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
757 int ctr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
758
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
759 dataptr = data;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
760 wsptr = output;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
761
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
762 for (; cnt > 0; cnt-=2) { //start positions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
763 threshold=(int16_t*)thr_adr;//threshold_mtx
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
764 for (ctr = DCTSIZE; ctr > 0; ctr--) {
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
765 // Process columns from input, add to output.
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
766 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
767 tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
768
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
769 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
770 tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
771
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
772 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
773 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
774
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
775 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
776 tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
777
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
778 // Even part of FDCT
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
779
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
780 tmp10 = tmp0 + tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
781 tmp13 = tmp0 - tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
782 tmp11 = tmp1 + tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
783 tmp12 = tmp1 - tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
784
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
785 d0 = tmp10 + tmp11;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
786 d4 = tmp10 - tmp11;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
787
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
788 z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
789 d2 = tmp13 + z1;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
790 d6 = tmp13 - z1;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
791
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
792 // Even part of IDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
793
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
794 THRESHOLD(tmp0, d0, threshold[0*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
795 THRESHOLD(tmp1, d2, threshold[2*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
796 THRESHOLD(tmp2, d4, threshold[4*8]);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
797 THRESHOLD(tmp3, d6, threshold[6*8]);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
798 tmp0+=2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
799 tmp10 = (tmp0 + tmp2)>>2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
800 tmp11 = (tmp0 - tmp2)>>2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
801
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
802 tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
803 tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
804
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
805 tmp0 = tmp10 + tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
806 tmp3 = tmp10 - tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
807 tmp1 = tmp11 + tmp12; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
808 tmp2 = tmp11 - tmp12; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
809
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
810 // Odd part of FDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
811
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
812 tmp10 = tmp4 + tmp5;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
813 tmp11 = tmp5 + tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
814 tmp12 = tmp6 + tmp7;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
815
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
816 z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
817 z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
818 z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
819 z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
820
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
821 z11 = tmp7 + z3;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
822 z13 = tmp7 - z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
823
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
824 d5 = z13 + z2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
825 d3 = z13 - z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
826 d1 = z11 + z4;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
827 d7 = z11 - z4;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
828
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
829 // Odd part of IDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
830
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
831 THRESHOLD(tmp4, d1, threshold[1*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
832 THRESHOLD(tmp5, d3, threshold[3*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
833 THRESHOLD(tmp6, d5, threshold[5*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
834 THRESHOLD(tmp7, d7, threshold[7*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
835
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
836 //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
837 z13 = tmp6 + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
838 z10 = (tmp6 - tmp5)<<1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
839 z11 = tmp4 + tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
840 z12 = (tmp4 - tmp7)<<1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
841
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
842 tmp7 = (z11 + z13)>>2; //+2 !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
843 tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
844 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
845 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
846 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
847
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
848 tmp6 = tmp12 - tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
849 tmp5 = tmp11 - tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
850 tmp4 = tmp10 + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
851
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
852 wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
853 wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
854 wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
855 wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
856 wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
857 wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
858 wsptr[DCTSIZE*6]= (tmp1 - tmp6);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
859 wsptr[DCTSIZE*7]= (tmp0 - tmp7);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
860 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
861 dataptr++; //next column
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
862 wsptr++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
863 threshold++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
864 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
865 dataptr+=8; //skip each second start pos
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
866 wsptr +=8;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
867 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
868 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
869
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
870 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
871
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
872 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
873 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
874 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
875 __asm__ volatile(
19372
6334c14b38eb Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents: 18131
diff changeset
876 ASMALIGN(4)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
877 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
878 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
879 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
880 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
881 "movq %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
882
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
883 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
884 "movq %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
885
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
886 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
887 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
888
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
889 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
890 "psubw %%mm7, %%mm1 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
891
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
892 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
893 "movq %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
894
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
895 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
896 "paddw %%mm7, %%mm5 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
897
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
898 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
899 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
900
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
901 "paddw %%mm2, %%mm6 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
902 "psubw %%mm2, %%mm7 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
903
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
904 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
905 "paddw %%mm6, %%mm5 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
906 // i0 t13 t12 i3 i1 d0 - d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
907 "psubw %%mm6, %%mm2 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
908 "paddw %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
909
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
910 "movq 4*16(%%"REG_d"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
911 "psllw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
912
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
913 "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
914 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
915
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
916 "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
917 "paddusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
918
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
919 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
920 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
921 "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
922 "paddw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
923
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
924 "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
925 "psubusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
926
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
927 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
928 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
929 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
930 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
931 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
932
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
933 "paddw %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
934 "psubw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
935
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
936 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
937 "paddw %%mm7, %%mm1 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
938
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
939 "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
940 "psubw %%mm7, %%mm6 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
941
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
942 "movq 6*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
943 "psraw $2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
944
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
945 "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
946 "psubw %%mm7, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
947 // t7 d2 /t11 t4 t6 - d6 /t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
948
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
949 "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
950 "paddusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
951
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
952 "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
953 "paddw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
954
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
955 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
956 "psubusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
957
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
958 //movq [edi+"DCTSIZE_S"*2*2], mm1
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
959 //movq [edi+"DCTSIZE_S"*6*2], mm6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
960 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
961 "psraw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
962
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
963 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
964 "psubw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
965
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
966 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
967 "paddw %%mm7, %%mm6 \n\t" //'t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
968
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
969 "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
970 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
971
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
972 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
973 "paddw %%mm6, %%mm2 \n\t" //'t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
974
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
975 "movq %%mm2, 0*8+%3 \n\t" //!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
976 "psubw %%mm6, %%mm7 \n\t" //'t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
977
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
978 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
979 "psubw %%mm6, %%mm1 \n\t" //'t12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
980
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
981 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
982 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
983
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
984 "movq %%mm7, 3*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
985 "paddw %%mm2, %%mm3 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
986
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
987 "paddw %%mm4, %%mm2 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
988 "paddw %%mm0, %%mm4 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
989
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
990 "movq %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
991 "psubw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
992
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
993 "psllw $2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
994 "psllw $2, %%mm7 \n\t" //opt for P6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
995
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
996 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
997 "psllw $2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
998
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
999 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1000 "psllw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1001
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1002 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1003 "paddw %%mm1, %%mm5 \n\t" //'t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1004
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1005 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1006 "psubw %%mm1, %%mm6 \n\t" //'t2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1007 // t7 't12 't11 t4 t6 - 't13 't10 ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1008
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1009 "paddw %%mm3, %%mm7 \n\t" //z2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1010
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1011 "movq %%mm5, 1*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1012 "paddw %%mm3, %%mm4 \n\t" //z4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1013
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1014 "movq 3*16(%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1015 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1016
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1017 "movq %%mm6, 2*8+%3 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1018 "psubw %%mm2, %%mm1 \n\t" //z13
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1019
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1020 //===
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1021 "paddw %%mm2, %%mm0 \n\t" //z11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1022 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1023
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1024 "movq 5*16(%%"REG_d"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1025 "psubw %%mm7, %%mm1 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1026
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1027 "paddw %%mm7, %%mm5 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1028 "psubw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1029
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1030 "movq 1*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1031 "psubw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1032
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1033 "movq %%mm0, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1034 "paddw %%mm4, %%mm0 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1035
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1036 "paddusw %%mm3, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1037 "psubw %%mm4, %%mm6 \n\t" //d7
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1038
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1039 // d1 d3 - - - d5 d7 -
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1040 "movq 7*16(%%"REG_d"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1041 "psubw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1042
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1043 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1044 "paddusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1045
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1046 "paddusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1047 "paddw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1048
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1049 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1050 "paddw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1051
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1052 "psubusw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1053 "psubusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1054
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1055 "psubusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1056 "movq %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1057
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1058 "por %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1059 "paddusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1060
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1061 "por %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1062 "paddw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1063
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1064 "packssdw %%mm4, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1065 "psubusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1066
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1067 "movd %%mm4, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1068 "or %%"REG_a", %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1069 "jnz 2f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1070 //movq [edi+"DCTSIZE_S"*3*2], mm1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1071 //movq [edi+"DCTSIZE_S"*5*2], mm5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1072 //movq [edi+"DCTSIZE_S"*1*2], mm0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1073 //movq [edi+"DCTSIZE_S"*7*2], mm6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1074 // t4 t5 - - - t6 t7 -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1075 //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1076 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1077 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1078 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1079
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1080 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1081 "movq %%mm1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1082
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1083 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1084 "movq %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1085
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1086 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1087 "paddw %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1088
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1089 "movq 1*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1090 //paddw mm3, MM_2
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1091 "psraw $2, %%mm3 \n\t" //tmp7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1092
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1093 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1094 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1095
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1096 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1097 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1098
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1099 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1100 "paddw %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1101
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1102 "movq 2*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1103 "psubw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1104
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1105 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1106 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1107
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1108 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1109 "paddw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1110
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1111 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1112 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1113
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1114 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1115 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1116
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1117 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1118 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1119
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1120 "movq 3*8+%3, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1121 "add $8, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1122
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1123 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1124 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1125
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1126 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1127 "psubw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1128
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1129 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1130 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1131
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1132 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1133 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1134
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1135 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1136
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1137 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1138 "add $8, %%"REG_D" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1139 "jmp 4f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1140
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1141 "2: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1142 //--- non DC2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1143 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1144 //psraw mm5, 2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1145 //psraw mm0, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1146 //psraw mm6, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1147 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1148 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1149
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1150 "psllw $1, %%mm5 \n\t" //'z10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1151 "paddw %%mm1, %%mm3 \n\t" //'z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1152
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1153 "movq %%mm0, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1154 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1155
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1156 "movq %%mm5, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1157 "psllw $1, %%mm0 \n\t" //'z12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1158
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1159 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1160 "paddw %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1161
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1162 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1163 "paddw %%mm6, %%mm2 \n\t" //'z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1164
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1165 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1166 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1167
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1168 //---
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1169 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1170 "psubw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1171
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1172 "psllw $1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1173 "paddw %%mm3, %%mm7 \n\t" //'t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1174
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1175 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1176 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1177 //paddw mm7, MM_2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1178 "psraw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1179
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1180 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1181 "psubw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1182
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1183 "movq 1*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1184 "paddw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1185
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1186 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1187 "paddw %%mm5, %%mm1 \n\t" //'t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1188
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1189 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1190 "psubw %%mm7, %%mm1 \n\t" //'t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1191
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1192 "movq 2*8+%3, %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1193 "psubw %%mm5, %%mm0 \n\t" //'t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1194
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1195 "movq 3*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1196 "movq %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1197
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1198 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1199 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1200
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1201 "psubw %%mm1, %%mm2 \n\t" //'t5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1202 "paddw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1203
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1204 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1205 "movq %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1206
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1207 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1208 "psubw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1209
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1210 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1211 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1212
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1213 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1214 "paddw %%mm2, %%mm0 \n\t" //'t4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1215
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1216 // 't4 't6 't5 - - - - 't7
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1217 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1218 "movq %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1219
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1220 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1221 "psubw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1222
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1223 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1224 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1225
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1226 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1227 "add $8, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1228
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1229 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1230
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1231 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1232 "add $8, %%"REG_D" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1233
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1234 "4: \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1235 //=part 2 (the same)===========================================================
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1236 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1237 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1238 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1239 "movq %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1240
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1241 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1242 "movq %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1243
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1244 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1245 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1246
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1247 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1248 "psubw %%mm7, %%mm1 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1249
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1250 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1251 "movq %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1252
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1253 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1254 "paddw %%mm7, %%mm5 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1255
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1256 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1257 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1258
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1259 "paddw %%mm2, %%mm6 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1260 "psubw %%mm2, %%mm7 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1261
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1262 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1263 "paddw %%mm6, %%mm5 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1264 // i0 t13 t12 i3 i1 d0 - d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1265 "psubw %%mm6, %%mm2 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1266 "paddw %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1267
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1268 "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1269 "psllw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1270
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1271 "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1272 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1273
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1274 "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1275 "paddusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1276
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1277 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1278 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1279 "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1280 "paddw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1281
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1282 "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1283 "psubusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1284
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1285 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1286 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1287 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1288 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1289 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1290
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1291 "paddw %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1292 "psubw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1293
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1294 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1295 "paddw %%mm7, %%mm1 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1296
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1297 "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1298 "psubw %%mm7, %%mm6 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1299
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1300 "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1301 "psraw $2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1302
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1303 "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1304 "psubw %%mm7, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1305 // t7 d2 /t11 t4 t6 - d6 /t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1306
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1307 "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1308 "paddusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1309
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1310 "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1311 "paddw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1312
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1313 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1314 "psubusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1315
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1316 //movq [edi+"DCTSIZE_S"*2*2], mm1
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1317 //movq [edi+"DCTSIZE_S"*6*2], mm6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1318 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1319 "psraw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1320
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1321 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1322 "psubw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1323
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1324 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1325 "paddw %%mm7, %%mm6 \n\t" //'t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1326
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1327 "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1328 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1329
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1330 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1331 "paddw %%mm6, %%mm2 \n\t" //'t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1332
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1333 "movq %%mm2, 0*8+%3 \n\t" //!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1334 "psubw %%mm6, %%mm7 \n\t" //'t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1335
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1336 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1337 "psubw %%mm6, %%mm1 \n\t" //'t12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1338
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1339 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1340 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1341
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1342 "movq %%mm7, 3*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1343 "paddw %%mm2, %%mm3 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1344
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1345 "paddw %%mm4, %%mm2 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1346 "paddw %%mm0, %%mm4 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1347
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1348 "movq %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1349 "psubw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1350
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1351 "psllw $2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1352 "psllw $2, %%mm7 \n\t" //opt for P6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1353
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1354 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1355 "psllw $2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1356
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1357 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1358 "psllw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1359
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1360 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1361 "paddw %%mm1, %%mm5 \n\t" //'t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1362
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1363 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1364 "psubw %%mm1, %%mm6 \n\t" //'t2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1365 // t7 't12 't11 t4 t6 - 't13 't10 ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1366
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1367 "paddw %%mm3, %%mm7 \n\t" //z2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1368
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1369 "movq %%mm5, 1*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1370 "paddw %%mm3, %%mm4 \n\t" //z4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1371
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1372 "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1373 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1374
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1375 "movq %%mm6, 2*8+%3 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1376 "psubw %%mm2, %%mm1 \n\t" //z13
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1377
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1378 //===
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1379 "paddw %%mm2, %%mm0 \n\t" //z11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1380 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1381
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1382 "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1383 "psubw %%mm7, %%mm1 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1384
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1385 "paddw %%mm7, %%mm5 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1386 "psubw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1387
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1388 "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1389 "psubw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1390
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1391 "movq %%mm0, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1392 "paddw %%mm4, %%mm0 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1393
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1394 "paddusw %%mm3, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1395 "psubw %%mm4, %%mm6 \n\t" //d7
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1396
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1397 // d1 d3 - - - d5 d7 -
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1398 "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1399 "psubw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1400
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1401 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1402 "paddusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1403
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1404 "paddusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1405 "paddw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1406
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1407 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1408 "paddw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1409
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1410 "psubusw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1411 "psubusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1412
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1413 "psubusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1414 "movq %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1415
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1416 "por %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1417 "paddusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1418
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1419 "por %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1420 "paddw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1421
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1422 "packssdw %%mm4, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1423 "psubusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1424
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1425 "movd %%mm4, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1426 "or %%"REG_a", %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1427 "jnz 3f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1428 //movq [edi+"DCTSIZE_S"*3*2], mm1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1429 //movq [edi+"DCTSIZE_S"*5*2], mm5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1430 //movq [edi+"DCTSIZE_S"*1*2], mm0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1431 //movq [edi+"DCTSIZE_S"*7*2], mm6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1432 // t4 t5 - - - t6 t7 -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1433 //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1434 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1435 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1436 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1437
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1438 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1439 "movq %%mm1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1440
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1441 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1442 "movq %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1443
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1444 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1445 "paddw %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1446
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1447 "movq 1*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1448 //paddw mm3, MM_2
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1449 "psraw $2, %%mm3 \n\t" //tmp7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1450
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1451 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1452 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1453
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1454 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1455 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1456
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1457 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1458 "paddw %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1459
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1460 "movq 2*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1461 "psubw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1462
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1463 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1464 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1465
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1466 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1467 "paddw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1468
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1469 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1470 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1471
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1472 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1473 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1474
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1475 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1476 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1477
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1478 "movq 3*8+%3, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1479 "add $24, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1480
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1481 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1482 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1483
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1484 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1485 "psubw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1486
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1487 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1488 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1489
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1490 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1491 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1492
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1493 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1494
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1495 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1496 "add $24, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1497 "sub $2, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1498 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1499 "jmp 5f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1500
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1501 "3: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1502 //--- non DC2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1503 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1504 //psraw mm5, 2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1505 //psraw mm0, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1506 //psraw mm6, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1507 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1508 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1509
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1510 "psllw $1, %%mm5 \n\t" //'z10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1511 "paddw %%mm1, %%mm3 \n\t" //'z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1512
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1513 "movq %%mm0, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1514 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1515
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1516 "movq %%mm5, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1517 "psllw $1, %%mm0 \n\t" //'z12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1518
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1519 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1520 "paddw %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1521
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1522 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1523 "paddw %%mm6, %%mm2 \n\t" //'z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1524
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1525 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1526 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1527
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1528 //---
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1529 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1530 "psubw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1531
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1532 "psllw $1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1533 "paddw %%mm3, %%mm7 \n\t" //'t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1534
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1535 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1536 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1537 //paddw mm7, MM_2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1538 "psraw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1539
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1540 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1541 "psubw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1542
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1543 "movq 1*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1544 "paddw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1545
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1546 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1547 "paddw %%mm5, %%mm1 \n\t" //'t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1548
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1549 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1550 "psubw %%mm7, %%mm1 \n\t" //'t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1551
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1552 "movq 2*8+%3, %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1553 "psubw %%mm5, %%mm0 \n\t" //'t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1554
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1555 "movq 3*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1556 "movq %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1557
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1558 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1559 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1560
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1561 "psubw %%mm1, %%mm2 \n\t" //'t5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1562 "paddw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1563
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1564 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1565 "movq %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1566
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1567 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1568 "psubw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1569
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1570 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1571 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1572
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1573 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1574 "paddw %%mm2, %%mm0 \n\t" //'t4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1575
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1576 // 't4 't6 't5 - - - - 't7
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1577 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1578 "movq %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1579
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1580 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1581 "psubw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1582
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1583 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1584 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1585
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1586 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1587 "add $24, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1588
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1589 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1590
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1591 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1592 "add $24, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1593 "sub $2, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1594 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1595 "5: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1596
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1597 : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
15632
e813a3e431a8 move unchanged registers back to input spec
henry
parents: 15631
diff changeset
1598 : "d"(thr_adr)
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1599 : "%"REG_a
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1600 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1601 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1602
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1603 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1604
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
1605 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1606
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1607 static void row_idct_c(DCTELEM* workspace,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1608 int16_t* output_adr, int output_stride, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1609 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1610 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1611 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1612 int_simd16_t z5, z10, z11, z12, z13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1613 int16_t* outptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1614 DCTELEM* wsptr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1615
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1616 cnt*=4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1617 wsptr = workspace;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1618 outptr = output_adr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1619 for (; cnt > 0; cnt--) {
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1620 // Even part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1621 //Simd version reads 4x4 block and transposes it
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1622 tmp10 = ( wsptr[2] + wsptr[3]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1623 tmp11 = ( wsptr[2] - wsptr[3]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1624
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1625 tmp13 = ( wsptr[0] + wsptr[1]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1626 tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1627
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1628 tmp0 = tmp10 + tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1629 tmp3 = tmp10 - tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1630 tmp1 = tmp11 + tmp12;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1631 tmp2 = tmp11 - tmp12;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1632
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1633 // Odd part
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1634 //Also transpose, with previous:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1635 // ---- ---- ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1636 // ---- ---- idct ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1637 // ---- ---- ---> ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1638 // ---- ---- ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1639 z13 = wsptr[4] + wsptr[5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1640 z10 = wsptr[4] - wsptr[5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1641 z11 = wsptr[6] + wsptr[7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1642 z12 = wsptr[6] - wsptr[7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1643
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1644 tmp7 = z11 + z13;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1645 tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1646
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1647 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1648 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1649 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1650
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1651 tmp6 = (tmp12<<3) - tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1652 tmp5 = (tmp11<<3) - tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1653 tmp4 = (tmp10<<3) + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1654
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1655 // Final output stage: descale and write column
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1656 outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1657 outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1658 outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1659 outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1660 outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1661 outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1662 outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1663 outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1664 outptr++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1665
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1666 wsptr += DCTSIZE; // advance pointer to next row
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1667 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1668 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1669
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1670 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1671
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1672 static void row_idct_mmx (DCTELEM* workspace,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1673 int16_t* output_adr, int output_stride, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1674 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1675 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
1676 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1677 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1678 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1679 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1680 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1681
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1682 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1683 "movq %%mm0, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1684
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1685 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1686 "punpcklwd %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1687
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1688 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1689 "punpckhwd %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1690
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1691 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1692 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1693 "punpcklwd %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1694
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1695 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1696 "punpckldq %%mm2, %%mm0 \n\t" //0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1697
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1698 "punpckhdq %%mm2, %%mm6 \n\t" //1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1699 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1700
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1701 "punpckhwd %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1702 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1703
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1704 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1705 "movq %%mm4, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1706
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1707 "punpckldq %%mm7, %%mm4 \n\t" //2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1708 "paddw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1709
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1710 "punpckhdq %%mm7, %%mm2 \n\t" //3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1711 "movq %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1712
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1713 "psllw $2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1714 "paddw %%mm2, %%mm4 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1715
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1716 "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1717 "psubw %%mm2, %%mm1 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1718
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1719 "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1720 "psubw %%mm5, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1721
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1722 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1723 "paddw %%mm5, %%mm4 \n\t" //t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1724
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1725 "psubw %%mm5, %%mm6 \n\t" //t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1726 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1727
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1728 "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1729 "paddw %%mm0, %%mm1 \n\t" //t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1730
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1731 "movq %%mm4, 0*8+%3 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1732 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1733
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1734 "movq %%mm6, 1*8+%3 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1735 "punpcklwd %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1736
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1737 //transpose 4x4
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1738 "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1739 "punpckhwd %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1740
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1741 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1742 "punpcklwd %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1743
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1744 "psubw %%mm0, %%mm7 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1745 "punpckhwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1746
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1747 "movq %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1748 "punpckldq %%mm5, %%mm3 \n\t" //4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1749
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1750 "punpckhdq %%mm5, %%mm0 \n\t" //5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1751 "movq %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1752
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1753 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1754 "movq %%mm3, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1755 "punpckldq %%mm2, %%mm4 \n\t" //6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1756
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1757 "psubw %%mm0, %%mm3 \n\t" //z10
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1758 "punpckhdq %%mm2, %%mm5 \n\t" //7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1759
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1760 "paddw %%mm0, %%mm6 \n\t" //z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1761 "movq %%mm4, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1762
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1763 "movq %%mm3, %%mm0 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1764 "psubw %%mm5, %%mm4 \n\t" //z12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1765
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1766 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1767 "paddw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1768
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1769 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1770 "paddw %%mm5, %%mm2 \n\t" //z11 >
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1771
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1772 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1773 "movq %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1774
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1775 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1776 "paddw %%mm6, %%mm5 \n\t" //t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1777
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1778 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1779 "paddw %%mm3, %%mm0 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1780
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1781 "psllw $3, %%mm0 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1782 "psubw %%mm3, %%mm4 \n\t" //t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1783
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1784 "movq 0*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1785 "movq %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1786
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1787 "psllw $3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1788 "psubw %%mm5, %%mm0 \n\t" //t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1789
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1790 "psllw $3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1791 "paddw %%mm0, %%mm1 \n\t" //d1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1792
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1793 "psubw %%mm0, %%mm2 \n\t" //t5
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1794 "psubw %%mm0, %%mm3 \n\t" //d6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1795
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1796 "paddw %%mm2, %%mm4 \n\t" //t4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1797 "movq %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1798
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1799 "paddw %%mm2, %%mm7 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1800 "psubw %%mm2, %%mm0 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1801
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1802 "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1803 "psubw %%mm5, %%mm6 \n\t" //d7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1804
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1805 "paddw 0*8+%3, %%mm5 \n\t" //d0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1806 "paddw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1807
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1808 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1809 "psraw $3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1810
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1811 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1812 "psraw $3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1813
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1814 "paddw (%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1815 "psraw $3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1816
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1817 "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1818 "paddw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1819
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1820 "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1821 "paddw %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1822
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1823 "movq %%mm5, (%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1824 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1825
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1826 "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1827 "psraw $3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1828
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1829 "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1830 "add %%"REG_d", %%"REG_D" \n\t" //3*ls
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1831
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1832 "movq 1*8+%3, %%mm5 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1833 "psraw $3, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1834
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1835 "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1836 "psubw %%mm4, %%mm5 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1837
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1838 "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1839 "psraw $3, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1840
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1841 "paddw 1*8+%3, %%mm4 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1842 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1843
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1844 "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1845 "paddw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1846
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1847 "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1848 "psraw $3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1849
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1850 "paddw (%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1851 "psraw $3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1852
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1853 "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1854 "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1855
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1856 "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1857 "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1858 "movq %%mm5, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1859 "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1860
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1861 "sub %%"REG_d", %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1862 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1863 "dec %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1864 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1865
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1866 : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1867 : "a"(output_stride*sizeof(short))
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1868 : "%"REG_d
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1869 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1870 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1871
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1872 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1873
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
1874 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1875
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1876 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1877 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1878 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1879 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1880 int_simd16_t z1, z2, z3, z4, z5, z11, z13;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1881 DCTELEM *dataptr;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1882
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1883 cnt*=4;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1884 // Pass 1: process rows.
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1885
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1886 dataptr = data;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1887 for (; cnt > 0; cnt--) {
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1888 tmp0 = pixels[line_size*0] + pixels[line_size*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1889 tmp7 = pixels[line_size*0] - pixels[line_size*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1890 tmp1 = pixels[line_size*1] + pixels[line_size*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1891 tmp6 = pixels[line_size*1] - pixels[line_size*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1892 tmp2 = pixels[line_size*2] + pixels[line_size*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1893 tmp5 = pixels[line_size*2] - pixels[line_size*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1894 tmp3 = pixels[line_size*3] + pixels[line_size*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1895 tmp4 = pixels[line_size*3] - pixels[line_size*4];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1896
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1897 // Even part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1898
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1899 tmp10 = tmp0 + tmp3;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1900 tmp13 = tmp0 - tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1901 tmp11 = tmp1 + tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1902 tmp12 = tmp1 - tmp2;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1903 //Even columns are written first, this leads to different order of columns
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1904 //in column_fidct(), but they are processed independently, so all ok.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1905 //Later in the row_idct() columns readed at the same order.
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1906 dataptr[2] = tmp10 + tmp11;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1907 dataptr[3] = tmp10 - tmp11;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1908
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1909 z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1910 dataptr[0] = tmp13 + z1;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1911 dataptr[1] = tmp13 - z1;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1912
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1913 // Odd part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1914
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1915 tmp10 = (tmp4 + tmp5) <<2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1916 tmp11 = (tmp5 + tmp6) <<2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1917 tmp12 = (tmp6 + tmp7) <<2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1918
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1919 z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1920 z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1921 z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1922 z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1923
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1924 z11 = tmp7 + z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1925 z13 = tmp7 - z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1926
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1927 dataptr[4] = z13 + z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1928 dataptr[5] = z13 - z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1929 dataptr[6] = z11 + z4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1930 dataptr[7] = z11 - z4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1931
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1932 pixels++; // advance pointer to next column
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1933 dataptr += DCTSIZE;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1934 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1935 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1936
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1937 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1938
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1939 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1940 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1941 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
1942 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1943 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1944 "6: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1945 "movd (%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1946 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1947
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1948 "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1949 "punpcklbw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1950
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1951 "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1952 "punpcklbw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1953
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1954 "punpcklbw %%mm7, %%mm2 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1955 "add %%"REG_d", %%"REG_S" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1956
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1957 "movq %%mm0, %%mm5 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1958 //
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1959
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1960 "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1961 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1962
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1963 "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1964 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1965
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1966 "psubw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1967 "punpcklbw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1968
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1969 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1970 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1971
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1972 "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1973 "paddw %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1974
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1975 "movq %%mm5, 0*8+%3 \n\t" //t7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1976 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1977
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1978 "movq %%mm6, 1*8+%3 \n\t" //t6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1979 "movq %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1980
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1981 "movd (%%"REG_S"), %%mm5 \n\t" //3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1982 "paddw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1983
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1984 "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1985 "punpcklbw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1986
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1987 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1988 "punpcklbw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1989
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1990 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1991 "paddw %%mm6, %%mm5 \n\t" //t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1992
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1993 "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1994 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1995
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1996 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1997 "psubw %%mm5, %%mm0 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1998
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1999 "psubw %%mm2, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2000 "paddw %%mm2, %%mm7 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2001
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2002 "paddw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2003 "movq %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2004
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2005 "psllw $2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2006 "paddw %%mm5, %%mm6 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2007
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2008 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2009 "paddw %%mm6, %%mm7 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2010
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2011 "psubw %%mm2, %%mm6 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2012 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2013
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2014 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2015 "movq %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2016 "punpcklwd %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2017
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2018 "paddw %%mm1, %%mm0 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2019 "punpckhwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2020
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2021 "psubw %%mm1, %%mm5 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2022 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2023
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2024 "movq 1*8+%3, %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2025 "punpcklwd %%mm5, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2026
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2027 "punpckhwd %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2028 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2029
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2030 "punpckldq %%mm7, %%mm0 \n\t" //0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2031 "paddw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2032
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2033 "punpckhdq %%mm7, %%mm5 \n\t" //1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2034 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2035
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2036 "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2037 "punpckldq %%mm2, %%mm6 \n\t" //2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2038
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2039 "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2040 "punpckhdq %%mm2, %%mm7 \n\t" //3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2041
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2042 "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2043 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2044
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2045 "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2046 "psllw $2, %%mm3 \n\t" //t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2047
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2048 "movq 0*8+%3, %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2049 "psllw $2, %%mm4 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2050
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2051 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" //z3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2052 "paddw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2053
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2054 "psllw $2, %%mm1 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2055 "movq %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2056
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2057 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2058 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2059
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2060 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2061 "movq %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2062
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2063 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2064 "psubw %%mm4, %%mm2 \n\t" //z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2065
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2066 "paddw %%mm4, %%mm5 \n\t" //z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2067 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2068
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2069 "paddw %%mm3, %%mm0 \n\t" //z2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2070 "movq %%mm5, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2071
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2072 "paddw %%mm0, %%mm2 \n\t" //d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2073 "psubw %%mm0, %%mm6 \n\t" //d5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2074
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2075 "movq %%mm2, %%mm4 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2076 "paddw %%mm3, %%mm1 \n\t" //z4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2077
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2078 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2079 "punpcklwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2080 "paddw %%mm1, %%mm5 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2081
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2082 "punpckhwd %%mm6, %%mm4 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2083 "psubw %%mm1, %%mm7 \n\t" //d7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2084
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2085 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2086 "punpcklwd %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2087
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2088 "punpckhwd %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2089 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2090
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2091 "punpckldq %%mm5, %%mm2 \n\t" //4
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2092 "sub %%"REG_d", %%"REG_S" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2093
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2094 "punpckhdq %%mm5, %%mm7 \n\t" //5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2095 "movq %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2096
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2097 "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2098 "punpckldq %%mm6, %%mm4 \n\t" //6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2099
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2100 "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2101 "punpckhdq %%mm6, %%mm5 \n\t" //7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2102
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2103 "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2104 "add $4, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2105
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2106 "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2107 "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2108 "dec %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2109 "jnz 6b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2110
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2111 : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
15632
e813a3e431a8 move unchanged registers back to input spec
henry
parents: 15631
diff changeset
2112 : "a"(line_size)
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2113 : "%"REG_d);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2114 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2115
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2116 #endif // HAVE_MMX