annotate libmpcodecs/vf_fspp.c @ 32282:606e4157cd4c

Split alloc and init of context so that parameters can be set in the context instead of requiring them to be passed through function parameters. This also makes sws work with AVOptions.
author michael
date Sun, 26 Sep 2010 19:33:57 +0000
parents f957f330aa6d
children 8fa2f43cb760
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1 /*
26727
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
2 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
3 * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
4 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
5 * This file is part of MPlayer.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
6 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
7 * MPlayer is free software; you can redistribute it and/or modify
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
8 * it under the terms of the GNU General Public License as published by
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
9 * the Free Software Foundation; either version 2 of the License, or
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
10 * (at your option) any later version.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
11 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
12 * MPlayer is distributed in the hope that it will be useful,
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
15 * GNU General Public License for more details.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
16 *
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
17 * You should have received a copy of the GNU General Public License along
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
18 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
82601a38e2a7 Use standard license headers.
diego
parents: 26052
diff changeset
20 */
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
21
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
22 /*
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
23 * This implementation is based on an algorithm described in
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
24 * "Aria Nosratinia Embedded Post-Processing for
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
25 * Enhancement of Compressed Images (1999)"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
26 * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
27 * Further, with splitting (i)dct into hor/ver passes, one of them can be
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
28 * performed once per block, not pixel. This allows for much better speed.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
29 */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
30
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
31 /*
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
32 Heavily optimized version of SPP filter by Nikolaj
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
33 */
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
34
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
35 #include <stdio.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
36 #include <stdlib.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
37 #include <string.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
38 #include <inttypes.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
39 #include <math.h>
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
40
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
41 #include "config.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
42
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
43 #include "mp_msg.h"
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
44 #include "cpudetect.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
45 #include "img_format.h"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
46 #include "mp_image.h"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
47 #include "vf.h"
31959
f957f330aa6d Introduce init_avcodec function to avoid duplicated FFmpeg initializations.
diego
parents: 30642
diff changeset
48 #include "vd_ffmpeg.h"
17012
6ff3379a0862 Unify include path handling, -I.. is in CFLAGS.
diego
parents: 16018
diff changeset
49 #include "libvo/fastmemcpy.h"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
50
28588
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
51 #include "libavutil/internal.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
52 #include "libavutil/intreadwrite.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
53 #include "libavutil/mem.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
54 #include "libavcodec/avcodec.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
55 #include "libavcodec/dsputil.h"
7f03a6d3c941 Move FFmpeg #includes below all others so that they do not override
diego
parents: 28327
diff changeset
56
28327
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
57 #undef free
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
58 #undef malloc
c39a1fd7d45c Fix compilation after DECLARE_ASM_CONST/DECLARE_ALIGNED moving within FFmpeg.
diego
parents: 28290
diff changeset
59
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
60 //===========================================================================//
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
61 #define BLOCKSZ 12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
62
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
63 static const short custom_threshold[64]=
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
64 // values (296) can't be too high
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
65 // -it causes too big quant dependence
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
66 // or maybe overflow(check), which results in some flashing
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
67 { 71, 296, 295, 237, 71, 40, 38, 19,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
68 245, 193, 185, 121, 102, 73, 53, 27,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
69 158, 129, 141, 107, 97, 73, 50, 26,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
70 102, 116, 109, 98, 82, 66, 45, 23,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
71 71, 94, 95, 81, 70, 56, 38, 20,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
72 56, 77, 74, 66, 56, 44, 30, 15,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
73 38, 53, 50, 45, 38, 30, 21, 11,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
74 20, 27, 26, 23, 20, 15, 11, 5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
75 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
76
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
77 static const uint8_t __attribute__((aligned(32))) dither[8][8]={
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
78 { 0, 48, 12, 60, 3, 51, 15, 63, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
79 { 32, 16, 44, 28, 35, 19, 47, 31, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
80 { 8, 56, 4, 52, 11, 59, 7, 55, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
81 { 40, 24, 36, 20, 43, 27, 39, 23, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
82 { 2, 50, 14, 62, 1, 49, 13, 61, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
83 { 34, 18, 46, 30, 33, 17, 45, 29, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
84 { 10, 58, 6, 54, 9, 57, 5, 53, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
85 { 42, 26, 38, 22, 41, 25, 37, 21, },
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
86 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
87
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
88 struct vf_priv_s { //align 16 !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
89 uint64_t threshold_mtx_noq[8*2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
90 uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
91
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
92 int log2_count;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
93 int temp_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
94 int qp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
95 int mpeg2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
96 int prev_q;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
97 uint8_t *src;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
98 int16_t *temp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
99 int bframes;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
100 char *non_b_qp;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
101 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
102
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
103
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
104 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
105
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
106 //This func reads from 1 slice (slice 1) and clears slices 0 & 1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
107 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
108 {int y, x;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
109 #define STORE(pos) \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
110 temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
111 src[x + pos]=src[x + pos - 8*src_stride]=0; \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
112 if(temp & 0x100) temp= ~(temp>>31); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
113 dst[x + pos]= temp;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
114
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
115 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
116 const uint8_t *d= dither[y];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
117 for(x=0; x<width; x+=8){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
118 int temp;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
119 STORE(0);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
120 STORE(1);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
121 STORE(2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
122 STORE(3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
123 STORE(4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
124 STORE(5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
125 STORE(6);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
126 STORE(7);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
127 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
128 src+=src_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
129 dst+=dst_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
130 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
131 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
132
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
133 //This func reads from 2 slices (0 & 2) and clears the second of them
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
134 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
135 {int y, x;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
136 #define STORE2(pos) \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
137 temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
138 src[x + pos + 16*src_stride]=0; \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
139 if(temp & 0x100) temp= ~(temp>>31); \
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
140 dst[x + pos]= temp;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
141
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
142 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
143 const uint8_t *d= dither[y];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
144 for(x=0; x<width; x+=8){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
145 int temp;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
146 STORE2(0);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
147 STORE2(1);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
148 STORE2(2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
149 STORE2(3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
150 STORE2(4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
151 STORE2(5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
152 STORE2(6);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
153 STORE2(7);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
154 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
155 src+=src_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
156 dst+=dst_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
157 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
158 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
159
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
160 static void mul_thrmat_c(struct vf_priv_s *p,int q)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
161 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
162 int a;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
163 for(a=0;a<64;a++)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
164 ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
165 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
166
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
167 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
168 static void row_idct_c(DCTELEM* workspace,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
169 int16_t* output_adr, int output_stride, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
170 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
171
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
172 //this is rather ugly, but there is no need for function pointers
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
173 #define store_slice_s store_slice_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
174 #define store_slice2_s store_slice2_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
175 #define mul_thrmat_s mul_thrmat_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
176 #define column_fidct_s column_fidct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
177 #define row_idct_s row_idct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
178 #define row_fdct_s row_fdct_c
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
179
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
180 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
181
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
182 //This func reads from 1 slice (slice 1) and clears slices 0 & 1
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
183 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
184 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
185 const uint8_t *od=&dither[0][0];
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
186 const uint8_t *end=&dither[height][0];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
187 width = (width+7)&~7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
188 dst_stride-=width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
189 //src_stride=(src_stride-width)*2;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
190 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
191 "mov %5, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
192 "mov %6, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
193 "mov %7, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
194 "mov %1, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
195 "movd %%"REG_d", %%mm5 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
196 "xor $-1, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
197 "mov %%"REG_a", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
198 "add $7, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
199 "neg %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
200 "sub %0, %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
201 "add %%"REG_c", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
202 "movd %%"REG_d", %%mm2 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
203 "mov %%"REG_c", %1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
204 "mov %2, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
205 "shl $4, %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
206
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
207 "2: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
208 "movq (%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
209 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
210 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
211 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
212 "punpckhbw %%mm7, %%mm4 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
213 "mov %0, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
214 "psraw %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
215 "psraw %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
216 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
217 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
218 "movq (%%"REG_S"), %%mm0 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
219 "movq 8(%%"REG_S"), %%mm1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
220
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
221 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
222 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
223 "paddw %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
224
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
225 "movq %%mm7, (%%"REG_S") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
226 "psraw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
227 "psraw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
228
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
229 "movq %%mm7, 8(%%"REG_S") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
230 "packuswb %%mm1, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
231 "add $16, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
232
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
233 "movq %%mm0, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
234 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
235 "sub $8, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
236 "jg 1b \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
237 "add %1, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
238 "add $8, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
239 "add %3, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
240 "cmp %4, %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
241 "jl 2b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
242
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
243 :
29310
c35891e664af replace "g" asm constraint by "erm" since "g" allows 64bit immediates while
gpoirier
parents: 29263
diff changeset
244 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
245 "m" (log2_scale), "m" (src), "m" (dst) //input
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
246 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
247 );
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
248 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
249
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
250 //This func reads from 2 slices, 0 & 2 and clears 2-nd
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
251 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
252 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
253 const uint8_t *od=&dither[0][0];
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
254 const uint8_t *end=&dither[height][0];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
255 width = (width+7)&~7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
256 dst_stride-=width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
257 //src_stride=(src_stride-width)*2;
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
258 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
259 "mov %5, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
260 "mov %6, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
261 "mov %7, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
262 "mov %1, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
263 "movd %%"REG_d", %%mm5 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
264 "xor $-1, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
265 "mov %%"REG_a", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
266 "add $7, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
267 "sub %0, %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
268 "add %%"REG_c", %%"REG_c" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
269 "movd %%"REG_d", %%mm2 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
270 "mov %%"REG_c", %1 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
271 "mov %2, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
272 "shl $5, %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
273
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
274 "2: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
275 "movq (%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
276 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
277 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
278 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
279 "punpckhbw %%mm7, %%mm4 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
280 "mov %0, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
281 "psraw %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
282 "psraw %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
283 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
284 "movq (%%"REG_S"), %%mm0 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
285 "movq 8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
286 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
287
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
288 "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
289 "paddw %%mm4, %%mm1 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
290 "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
291
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
292 "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
293 "psraw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
294 "paddw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
295
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
296 "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
297 "psraw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
298 "packuswb %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
299
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
300 "movq %%mm0, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
301 "add $16, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
302 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
303 "sub $8, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
304 "jg 1b \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
305 "add %1, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
306 "add $8, %%"REG_d" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
307 "add %3, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
308 "cmp %4, %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
309 "jl 2b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
310
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
311 :
29310
c35891e664af replace "g" asm constraint by "erm" since "g" allows 64bit immediates while
gpoirier
parents: 29263
diff changeset
312 : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
313 "m" (log2_scale), "m" (src), "m" (dst) //input
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
314 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
315 );
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
316 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
317
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
318 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
319 {
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
320 uint64_t *adr=&p->threshold_mtx_noq[0];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
321 __asm__ volatile(
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
322 "movd %0, %%mm7 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
323 "add $8*8*2, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
324 "movq 0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
325 "punpcklwd %%mm7, %%mm7 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
326 "movq 1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
327 "punpckldq %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
328 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
329
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
330 "movq 2*8(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
331 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
332
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
333 "movq 3*8(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
334 "pmullw %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
335
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
336 "movq %%mm0, 0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
337 "movq 4*8(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
338 "pmullw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
339
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
340 "movq %%mm1, 1*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
341 "movq 5*8(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
342 "pmullw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
343
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
344 "movq %%mm2, 2*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
345 "movq 6*8(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
346 "pmullw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
347
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
348 "movq %%mm3, 3*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
349 "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
350 "pmullw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
351
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
352 "movq %%mm4, 4*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
353 "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
354 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
355
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
356 "movq %%mm5, 5*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
357 "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
358 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
359
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
360 "movq %%mm6, 6*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
361 "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
362 "pmullw %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
363
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
364 "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
365 "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
366 "pmullw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
367
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
368 "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
369 "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
370 "pmullw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
371
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
372 "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
373 "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
374 "pmullw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
375
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
376 "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
377 "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
378 "pmullw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
379
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
380 "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
381 "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
382 "pmullw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
383
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
384 "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
385 "pmullw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
386
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
387 "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
388 "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
389 "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
390
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
391 : "+g" (q), "+S" (adr), "+D" (adr)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
392 :
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
393 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
394 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
395
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
396 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
397 static void row_idct_mmx(DCTELEM* workspace,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
398 int16_t* output_adr, int output_stride, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
399 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
400
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
401 #define store_slice_s store_slice_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
402 #define store_slice2_s store_slice2_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
403 #define mul_thrmat_s mul_thrmat_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
404 #define column_fidct_s column_fidct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
405 #define row_idct_s row_idct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
406 #define row_fdct_s row_fdct_mmx
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
407 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
408
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
409 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
410 int dst_stride, int src_stride,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
411 int width, int height,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
412 uint8_t *qp_store, int qp_stride, int is_luma)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
413 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
414 int x, x0, y, es, qy, t;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
415 const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
416 const int step=6-p->log2_count;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
417 const int qps= 3 + is_luma;
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
418 int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
419 DCTELEM *block= (DCTELEM *)block_align;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
420 DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
421
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
422 memset(block3, 0, 4*8*BLOCKSZ);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
423
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
424 //p->src=src-src_stride*8-8;//!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
425 if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
426 for(y=0; y<height; y++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
427 int index= 8 + 8*stride + y*stride;
23457
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
428 fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
429 for(x=0; x<8; x++){
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
430 p->src[index - x - 1]= p->src[index + x ];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
431 p->src[index + width + x ]= p->src[index + width - x - 1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
432 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
433 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
434 for(y=0; y<8; y++){
23457
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
435 fast_memcpy(p->src + ( 7-y)*stride, p->src + ( y+8)*stride, stride);
a124f3abc1ec Replace implicit use of fast_memcpy via macro by explicit use to allow
reimar
parents: 21578
diff changeset
436 fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
437 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
438 //FIXME (try edge emu)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
439
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
440 for(y=8; y<24; y++)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
441 memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
442
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
443 for(y=step; y<height+8; y+=step){ //step= 1,2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
444 qy=y-4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
445 if (qy>height-1) qy=height-1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
446 if (qy<0) qy=0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
447 qy=(qy>>qps)*qp_stride;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
448 row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
449 for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
450 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
451 if(p->qp)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
452 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
453 else
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
454 for (x=0; x<8*(BLOCKSZ-1); x+=8) {
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
455 t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
456 if (t<0) t=0;//t always < width-2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
457 t=qp_store[qy+(t>>qps)];
30412
41fb4acf3df6 Support more qscale types in most post-processing filters.
reimar
parents: 30363
diff changeset
458 t=norm_qscale(t, p->mpeg2);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
459 if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
460 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
461 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
462 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
25568
707b810a2558 fix artifacts in -vf fspp. regression in r23476.
lorenm
parents: 25221
diff changeset
463 memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling
707b810a2558 fix artifacts in -vf fspp. regression in r23476.
lorenm
parents: 25221
diff changeset
464 memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
465 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
466 //
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
467 es=width+8-x0; // 8, ...
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
468 if (es>8)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
469 row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
21578
9345eb2d8c8f count needs to be even
henry
parents: 20585
diff changeset
470 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
471 row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
472 {const int y1=y-8+step;//l5-7 l4-6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
473 if (!(y1&7) && y1) {
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
474 if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
475 dst_stride, stride, width, 8, 5-p->log2_count);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
476 else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
477 dst_stride, stride, width, 8, 5-p->log2_count);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
478 } }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
479 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
480
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
481 if (y&7) { // == height & 7
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
482 if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
483 dst_stride, stride, width, y&7, 5-p->log2_count);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
484 else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
485 dst_stride, stride, width, y&7, 5-p->log2_count);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
486 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
487 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
488
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
489 static int config(struct vf_instance *vf,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
490 int width, int height, int d_width, int d_height,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
491 unsigned int flags, unsigned int outfmt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
492 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
493 int h= (height+16+15)&(~15);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
494
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
495 vf->priv->temp_stride= (width+16+15)&(~15);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
496 vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
497 //this can also be avoided, see above
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
498 vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
499
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
500 return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
501 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
502
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
503 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
504 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
505 if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
506 // ok, we can do pp in-place (or pp disabled):
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
507 vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
16018
bdf1b4ecb906 use stored dimensions instead of visible one when (vf_)get_image is called
iive
parents: 15651
diff changeset
508 mpi->type, mpi->flags, mpi->width, mpi->height);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
509 mpi->planes[0]=vf->dmpi->planes[0];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
510 mpi->stride[0]=vf->dmpi->stride[0];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
511 mpi->width=vf->dmpi->width;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
512 if(mpi->flags&MP_IMGFLAG_PLANAR){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
513 mpi->planes[1]=vf->dmpi->planes[1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
514 mpi->planes[2]=vf->dmpi->planes[2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
515 mpi->stride[1]=vf->dmpi->stride[1];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
516 mpi->stride[2]=vf->dmpi->stride[2];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
517 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
518 mpi->flags|=MP_IMGFLAG_DIRECT;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
519 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
520
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
521 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
522 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
523 mp_image_t *dmpi;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
524 if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
525 // no DR, so get a new image! hope we'll get DR buffer:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
526 dmpi=vf_get_image(vf->next,mpi->imgfmt,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
527 MP_IMGTYPE_TEMP,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
528 MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
16018
bdf1b4ecb906 use stored dimensions instead of visible one when (vf_)get_image is called
iive
parents: 15651
diff changeset
529 mpi->width,mpi->height);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
530 vf_clone_mpi_attributes(dmpi, mpi);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
531 }else{
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
532 dmpi=vf->dmpi;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
533 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
534
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
535 vf->priv->mpeg2= mpi->qscale_type;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
536 if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
30363
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
537 int w = mpi->qstride;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
538 int h = (mpi->h + 15) >> 4;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
539 if (!w) {
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
540 w = (mpi->w + 15) >> 4;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
541 h = 1;
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
542 }
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
543 if(!vf->priv->non_b_qp)
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
544 vf->priv->non_b_qp= malloc(w*h);
915be5c7a30c Make sure that a qstride of 0 (intentional or not) does not completely break
reimar
parents: 29310
diff changeset
545 fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
546 }
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
547 if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
548 char *qp_tab= vf->priv->non_b_qp;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
549 if(vf->priv->bframes || !qp_tab)
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
550 qp_tab= mpi->qscale;
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
551
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
552 if(qp_tab || vf->priv->qp){
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
553 filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
554 mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
555 filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
556 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
557 filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
558 mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
559 }else{
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
560 memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
561 memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
562 memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
563 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
564 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
565
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
566 #if HAVE_MMX
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
567 if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
568 #endif
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
569 #if HAVE_MMX2
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
570 if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
571 #endif
17906
20aca9baf5d8 passing pts through the filter layer (lets see if pts or cola comes out at the end)
michael
parents: 17523
diff changeset
572 return vf_next_put_image(vf,dmpi, pts);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
573 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
574
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
575 static void uninit(struct vf_instance *vf)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
576 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
577 if(!vf->priv) return;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
578
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
579 if(vf->priv->temp) av_free(vf->priv->temp);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
580 vf->priv->temp= NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
581 if(vf->priv->src) av_free(vf->priv->src);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
582 vf->priv->src= NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
583 //if(vf->priv->avctx) free(vf->priv->avctx);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
584 //vf->priv->avctx= NULL;
17133
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
585 if(vf->priv->non_b_qp) free(vf->priv->non_b_qp);
a2b24e0d7772 prevent flicker on b-frames, trivial port from vf_spp
henry
parents: 17012
diff changeset
586 vf->priv->non_b_qp= NULL;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
587
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
588 av_free(vf->priv);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
589 vf->priv=NULL;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
590 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
591
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
592 //===========================================================================//
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
593
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
594 static int query_format(struct vf_instance *vf, unsigned int fmt)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
595 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
596 switch(fmt){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
597 case IMGFMT_YVU9:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
598 case IMGFMT_IF09:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
599 case IMGFMT_YV12:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
600 case IMGFMT_I420:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
601 case IMGFMT_IYUV:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
602 case IMGFMT_CLPL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
603 case IMGFMT_Y800:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
604 case IMGFMT_Y8:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
605 case IMGFMT_444P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
606 case IMGFMT_422P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
607 case IMGFMT_411P:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
608 return vf_next_query_format(vf,fmt);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
609 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
610 return 0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
611 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
612
30642
a972c1a4a012 cosmetics: Rename struct vf_instance_s --> vf_instance.
diego
parents: 30638
diff changeset
613 static int control(struct vf_instance *vf, int request, void* data)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
614 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
615 switch(request){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
616 case VFCTRL_QUERY_MAX_PP_LEVEL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
617 return 5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
618 case VFCTRL_SET_PP_LEVEL:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
619 vf->priv->log2_count= *((unsigned int*)data);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
620 if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
621 return CONTROL_TRUE;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
622 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
623 return vf_next_control(vf,request,data);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
624 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
625
30638
a7b908875c14 Rename open() vf initialization function to vf_open().
diego
parents: 30412
diff changeset
626 static int vf_open(vf_instance_t *vf, char *args)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
627 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
628 int i=0, bias;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
629 int custom_threshold_m[64];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
630 int log2c=-1;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
631
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
632 vf->config=config;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
633 vf->put_image=put_image;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
634 vf->get_image=get_image;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
635 vf->query_format=query_format;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
636 vf->uninit=uninit;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
637 vf->control= control;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
638 vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
639
31959
f957f330aa6d Introduce init_avcodec function to avoid duplicated FFmpeg initializations.
diego
parents: 30642
diff changeset
640 init_avcodec();
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
641
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
642 //vf->priv->avctx= avcodec_alloc_context();
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
643 //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
644
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
645 vf->priv->log2_count= 4;
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
646 vf->priv->bframes = 0;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
647
17225
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
648 if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
649
ec9888363742 reverse the H264 hack
henry
parents: 17223
diff changeset
650 if( log2c >=4 && log2c <=5 )
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
651 vf->priv->log2_count = log2c;
15651
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
652 else if( log2c >= 6 )
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
653 vf->priv->log2_count = 5;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
654
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
655 if(vf->priv->qp < 0)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
656 vf->priv->qp = 0;
15651
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
657
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
658 if (i < -15) i = -15;
6a0494e09435 sanity checks for options; treat quality > 5 as 5, not 4
henry
parents: 15634
diff changeset
659 if (i > 32) i = 32;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
660
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
661 bias= (1<<4)+i; //regulable
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
662 vf->priv->prev_q=0;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
663 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
664 for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
665 custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
666 for(i=0;i<8;i++){
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
667 vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
668 |(((uint64_t)custom_threshold_m[i*8+6])<<16)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
669 |(((uint64_t)custom_threshold_m[i*8+0])<<32)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
670 |(((uint64_t)custom_threshold_m[i*8+4])<<48);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
671 vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
672 |(((uint64_t)custom_threshold_m[i*8+3])<<16)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
673 |(((uint64_t)custom_threshold_m[i*8+1])<<32)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
674 |(((uint64_t)custom_threshold_m[i*8+7])<<48);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
675 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
676
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
677 if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
678
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
679 return 1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
680 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
681
25221
00fff9a3b735 Make all vf_info_t structs const
reimar
parents: 24976
diff changeset
682 const vf_info_t vf_info_fspp = {
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
683 "fast simple postprocess",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
684 "fspp",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
685 "Michael Niedermayer, Nikolaj Poroshin",
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
686 "",
30638
a7b908875c14 Rename open() vf initialization function to vf_open().
diego
parents: 30412
diff changeset
687 vf_open,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
688 NULL
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
689 };
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
690
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
691 //====================================================================
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
692 //Specific spp's dct, idct and threshold functions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
693 //I'd prefer to have them in the separate file.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
694
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
695 //#define MANGLE(a) #a
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
696
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
697 //typedef int16_t DCTELEM; //! only int16_t
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
698
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
#define DCTSIZE 8
#define DCTSIZE_S "8"

/* Convert x to fixed point with s fractional bits (rounded by adding 0.5),
 * keeping only the low 16 bits of the result. */
#define FIX(x,s) (((int) ((x) * (1 << (s)) + 0.5)) & 0xffff)
/* Replicate a 16-bit value into all four 16-bit lanes of a 64-bit word.
 * Every shift is done in uint64_t so that no int is shifted into its sign
 * bit; x is expected to fit in 16 bits. */
#define C64(x) (((uint64_t)(x) << 48) | ((uint64_t)(x) << 32) | ((uint64_t)(x) << 16) | (uint64_t)(x))
#define FIX64(x,s) C64(FIX(x,s))

/* Upper part of the product x*k: the product shifted right by 16. */
#define MULTIPLY16H(x,k) (((x)*(k))>>16)
/* r = x when x lies outside [-t, t], otherwise r = 0. Kept as a plain
 * if/else statement (with its trailing ';') so existing uses still compile. */
#define THRESHOLD(r,x,t) if(((unsigned)((x)+(t)))>(t)*2) r=(x);else r=0;
/* Shift right by n bits after adding half of the divisor. */
#define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
709
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
710 #if HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
711
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
712 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
714 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
715 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
716
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
717 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
718
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
719 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
722 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
723 //for t3,t5,t7 == 0 shortcut
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
724 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
726 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
727
25901
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
728 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
c2210e68a2a9 Simplify: use DECLARE_ASM_CONST
reimar
parents: 25568
diff changeset
729 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
730
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
731 #else /* !HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
732
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
733 typedef int32_t int_simd16_t; //type of the intermediates in the C (!HAVE_MMX) column pass
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
734 static const int16_t FIX_0_382683433=FIX(0.382683433, 14); //NOTE(review): FIX() is defined outside this view; its 2nd argument (14 or 13 below) is presumably the fixed-point shift - confirm
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
735 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
736 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
737 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
738 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14); //same value as FIX_1_414213562 below, but built with 14 instead of 13
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
739 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
25902
15ab840747e2 mark constants as such
reimar
parents: 25901
diff changeset
740 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //- (the constant holds the negated value)
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
741 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
25902
15ab840747e2 mark constants as such
reimar
parents: 25901
diff changeset
742 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
743
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
744 #endif
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
745
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
746 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
747
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
// Column pass of the C (!HAVE_MMX) build. For every column it runs the
// forward DCT butterflies, passes each of the eight coefficients through
// THRESHOLD() against the matching thr_adr entry, runs the inverse
// butterflies and writes the result to output.
//
// thr_adr : threshold table; read at [k*8 + column] for k = 0..7 and rewound
//           to its start for every group of DCTSIZE columns.
// data    : input samples; the eight rows of a column are DCTSIZE apart.
// output  : destination; rows 0..5 are accumulated (+=), rows 6 and 7 are
//           stored (=).
// cnt     : loop budget; every group of DCTSIZE columns consumes 2, and both
//           data and output skip 8 further elements after each group.
//
// NOTE(review): THRESHOLD, MULTIPLY16H and FIX() are defined outside this
// view, so the exact fixed-point scaling is not described here.
748 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
749 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
750 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
751 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
752 int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
753 int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
754
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
755 DCTELEM* dataptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
756 DCTELEM* wsptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
757 int16_t *threshold;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
758 int ctr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
759
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
760 dataptr = data;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
761 wsptr = output;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
762
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
763 for (; cnt > 0; cnt-=2) { //start positions
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
764 threshold=(int16_t*)thr_adr;//threshold_mtx, rewound for every group of columns
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
765 for (ctr = DCTSIZE; ctr > 0; ctr--) {
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
766 // Process columns from input, add to output.
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
767 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
768 tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
769
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
770 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
771 tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
772
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
773 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
774 tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
775
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
776 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
777 tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
778
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
779 // Even part of FDCT
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
780
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
781 tmp10 = tmp0 + tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
782 tmp13 = tmp0 - tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
783 tmp11 = tmp1 + tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
784 tmp12 = tmp1 - tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
785
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
786 d0 = tmp10 + tmp11;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
787 d4 = tmp10 - tmp11;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
788
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
789 z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
790 d2 = tmp13 + z1;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
791 d6 = tmp13 - z1;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
792
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
793 // Even part of IDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
794
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
795 THRESHOLD(tmp0, d0, threshold[0*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
796 THRESHOLD(tmp1, d2, threshold[2*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
797 THRESHOLD(tmp2, d4, threshold[4*8]);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
798 THRESHOLD(tmp3, d6, threshold[6*8]);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
799 tmp0+=2; //bias added ahead of the two >>2 shifts that follow
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
800 tmp10 = (tmp0 + tmp2)>>2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
801 tmp11 = (tmp0 - tmp2)>>2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
802
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
803 tmp13 = (tmp1 + tmp3)>>2; //+2 ! (psnr decides)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
804 tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
805
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
806 tmp0 = tmp10 + tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
807 tmp3 = tmp10 - tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
808 tmp1 = tmp11 + tmp12; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
809 tmp2 = tmp11 - tmp12; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
810
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
811 // Odd part of FDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
812
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
813 tmp10 = tmp4 + tmp5;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
814 tmp11 = tmp5 + tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
815 tmp12 = tmp6 + tmp7;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
816
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
817 z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
818 z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
819 z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
820 z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
821
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
822 z11 = tmp7 + z3;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
823 z13 = tmp7 - z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
824
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
825 d5 = z13 + z2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
826 d3 = z13 - z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
827 d1 = z11 + z4;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
828 d7 = z11 - z4;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
829
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
830 // Odd part of IDCT
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
831
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
832 THRESHOLD(tmp4, d1, threshold[1*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
833 THRESHOLD(tmp5, d3, threshold[3*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
834 THRESHOLD(tmp6, d5, threshold[5*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
835 THRESHOLD(tmp7, d7, threshold[7*8]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
836
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
837 //The SIMD version uses a shortcut here for the tmp5,tmp6,tmp7 == 0 case
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
838 z13 = tmp6 + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
839 z10 = (tmp6 - tmp5)<<1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
840 z11 = tmp4 + tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
841 z12 = (tmp4 - tmp7)<<1;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
842
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
843 tmp7 = (z11 + z13)>>2; //+2 !
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
844 tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
845 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
846 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
847 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! (FIX_2_613125930 is defined from -2.613125930)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
848
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
849 tmp6 = tmp12 - tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
850 tmp5 = tmp11 - tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
851 tmp4 = tmp10 + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
852
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
853 wsptr[DCTSIZE*0]+= (tmp0 + tmp7);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
854 wsptr[DCTSIZE*1]+= (tmp1 + tmp6);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
855 wsptr[DCTSIZE*2]+= (tmp2 + tmp5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
856 wsptr[DCTSIZE*3]+= (tmp3 - tmp4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
857 wsptr[DCTSIZE*4]+= (tmp3 + tmp4);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
858 wsptr[DCTSIZE*5]+= (tmp2 - tmp5);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
859 wsptr[DCTSIZE*6]= (tmp1 - tmp6); //NOTE(review): plain store, not += like rows 0-5 - presumably intentional, confirm against the caller
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
860 wsptr[DCTSIZE*7]= (tmp0 - tmp7); //plain store as well
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
861 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
862 dataptr++; //next column
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
863 wsptr++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
864 threshold++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
865 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
866 dataptr+=8; //skip each second start pos
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
867 wsptr +=8;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
868 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
869 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
870
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
871 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
872
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
873 static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
874 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
875 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
876 __asm__ volatile(
19372
6334c14b38eb Replace asmalign.h hack by ASMALIGN cpp macros from config.h.
diego
parents: 18131
diff changeset
877 ASMALIGN(4)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
878 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
879 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
880 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
881 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
882 "movq %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
883
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
884 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
885 "movq %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
886
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
887 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
888 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
889
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
890 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
891 "psubw %%mm7, %%mm1 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
892
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
893 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
894 "movq %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
895
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
896 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
897 "paddw %%mm7, %%mm5 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
898
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
899 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
900 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
901
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
902 "paddw %%mm2, %%mm6 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
903 "psubw %%mm2, %%mm7 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
904
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
905 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
906 "paddw %%mm6, %%mm5 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
907 // i0 t13 t12 i3 i1 d0 - d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
908 "psubw %%mm6, %%mm2 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
909 "paddw %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
910
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
911 "movq 4*16(%%"REG_d"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
912 "psllw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
913
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
914 "psubw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
915 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
916
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
917 "paddusw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
918 "paddusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
919
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
920 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
921 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
922 "paddw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
923 "paddw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
924
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
925 "psubusw 0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
926 "psubusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
927
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
928 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
929 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
930 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
931 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
932 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
933
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
934 "paddw %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
935 "psubw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
936
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
937 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
938 "paddw %%mm7, %%mm1 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
939
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
940 "psubw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
941 "psubw %%mm7, %%mm6 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
942
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
943 "movq 6*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
944 "psraw $2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
945
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
946 "paddusw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
947 "psubw %%mm7, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
948 // t7 d2 /t11 t4 t6 - d6 /t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
949
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
950 "paddw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
951 "paddusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
952
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
953 "psubusw 2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
954 "paddw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
955
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
956 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
957 "psubusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
958
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
959 //movq [edi+"DCTSIZE_S"*2*2], mm1
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
960 //movq [edi+"DCTSIZE_S"*6*2], mm6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
961 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
962 "psraw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
963
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
964 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
965 "psubw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
966
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
967 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
968 "paddw %%mm7, %%mm6 \n\t" //'t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
969
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
970 "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
971 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
972
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
973 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
974 "paddw %%mm6, %%mm2 \n\t" //'t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
975
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
976 "movq %%mm2, 0*8+%3 \n\t" //!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
977 "psubw %%mm6, %%mm7 \n\t" //'t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
978
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
979 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
980 "psubw %%mm6, %%mm1 \n\t" //'t12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
981
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
982 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
983 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
984
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
985 "movq %%mm7, 3*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
986 "paddw %%mm2, %%mm3 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
987
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
988 "paddw %%mm4, %%mm2 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
989 "paddw %%mm0, %%mm4 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
990
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
991 "movq %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
992 "psubw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
993
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
994 "psllw $2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
995 "psllw $2, %%mm7 \n\t" //opt for P6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
996
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
997 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
998 "psllw $2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
999
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1000 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1001 "psllw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1002
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1003 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1004 "paddw %%mm1, %%mm5 \n\t" //'t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1005
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1006 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1007 "psubw %%mm1, %%mm6 \n\t" //'t2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1008 // t7 't12 't11 t4 t6 - 't13 't10 ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1009
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1010 "paddw %%mm3, %%mm7 \n\t" //z2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1011
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1012 "movq %%mm5, 1*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1013 "paddw %%mm3, %%mm4 \n\t" //z4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1014
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1015 "movq 3*16(%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1016 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1017
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1018 "movq %%mm6, 2*8+%3 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1019 "psubw %%mm2, %%mm1 \n\t" //z13
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1020
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1021 //===
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1022 "paddw %%mm2, %%mm0 \n\t" //z11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1023 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1024
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1025 "movq 5*16(%%"REG_d"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1026 "psubw %%mm7, %%mm1 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1027
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1028 "paddw %%mm7, %%mm5 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1029 "psubw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1030
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1031 "movq 1*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1032 "psubw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1033
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1034 "movq %%mm0, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1035 "paddw %%mm4, %%mm0 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1036
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1037 "paddusw %%mm3, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1038 "psubw %%mm4, %%mm6 \n\t" //d7
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1039
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1040 // d1 d3 - - - d5 d7 -
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1041 "movq 7*16(%%"REG_d"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1042 "psubw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1043
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1044 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1045 "paddusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1046
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1047 "paddusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1048 "paddw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1049
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1050 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1051 "paddw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1052
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1053 "psubusw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1054 "psubusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1055
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1056 "psubusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1057 "movq %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1058
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1059 "por %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1060 "paddusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1061
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1062 "por %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1063 "paddw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1064
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1065 "packssdw %%mm4, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1066 "psubusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1067
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1068 "movd %%mm4, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1069 "or %%"REG_a", %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1070 "jnz 2f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1071 //movq [edi+"DCTSIZE_S"*3*2], mm1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1072 //movq [edi+"DCTSIZE_S"*5*2], mm5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1073 //movq [edi+"DCTSIZE_S"*1*2], mm0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1074 //movq [edi+"DCTSIZE_S"*7*2], mm6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1075 // t4 t5 - - - t6 t7 -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1076 //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1077 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1078 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1079 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1080
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1081 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1082 "movq %%mm1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1083
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1084 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1085 "movq %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1086
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1087 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1088 "paddw %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1089
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1090 "movq 1*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1091 //paddw mm3, MM_2
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1092 "psraw $2, %%mm3 \n\t" //tmp7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1093
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1094 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1095 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1096
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1097 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1098 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1099
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1100 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1101 "paddw %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1102
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1103 "movq 2*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1104 "psubw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1105
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1106 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1107 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1108
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1109 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1110 "paddw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1111
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1112 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1113 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1114
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1115 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1116 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1117
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1118 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1119 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1120
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1121 "movq 3*8+%3, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1122 "add $8, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1123
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1124 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1125 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1126
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1127 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1128 "psubw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1129
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1130 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1131 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1132
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1133 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1134 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1135
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1136 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1137
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1138 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1139 "add $8, %%"REG_D" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1140 "jmp 4f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1141
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1142 "2: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1143 //--- non DC2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1144 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1145 //psraw mm5, 2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1146 //psraw mm0, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1147 //psraw mm6, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1148 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1149 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1150
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1151 "psllw $1, %%mm5 \n\t" //'z10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1152 "paddw %%mm1, %%mm3 \n\t" //'z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1153
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1154 "movq %%mm0, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1155 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1156
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1157 "movq %%mm5, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1158 "psllw $1, %%mm0 \n\t" //'z12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1159
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1160 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1161 "paddw %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1162
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1163 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1164 "paddw %%mm6, %%mm2 \n\t" //'z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1165
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1166 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1167 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1168
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1169 //---
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1170 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1171 "psubw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1172
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1173 "psllw $1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1174 "paddw %%mm3, %%mm7 \n\t" //'t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1175
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1176 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1177 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1178 //paddw mm7, MM_2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1179 "psraw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1180
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1181 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1182 "psubw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1183
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1184 "movq 1*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1185 "paddw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1186
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1187 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1188 "paddw %%mm5, %%mm1 \n\t" //'t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1189
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1190 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1191 "psubw %%mm7, %%mm1 \n\t" //'t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1192
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1193 "movq 2*8+%3, %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1194 "psubw %%mm5, %%mm0 \n\t" //'t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1195
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1196 "movq 3*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1197 "movq %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1198
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1199 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1200 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1201
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1202 "psubw %%mm1, %%mm2 \n\t" //'t5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1203 "paddw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1204
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1205 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1206 "movq %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1207
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1208 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1209 "psubw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1210
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1211 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1212 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1213
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1214 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1215 "paddw %%mm2, %%mm0 \n\t" //'t4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1216
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1217 // 't4 't6 't5 - - - - 't7
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1218 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1219 "movq %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1220
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1221 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1222 "psubw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1223
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1224 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1225 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1226
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1227 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1228 "add $8, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1229
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1230 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1231
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1232 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1233 "add $8, %%"REG_D" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1234
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1235 "4: \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1236 //=part 2 (the same)===========================================================
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1237 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1238 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1239 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1240 "movq %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1241
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1242 "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1243 "movq %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1244
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1245 "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1246 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1247
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1248 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1249 "psubw %%mm7, %%mm1 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1250
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1251 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1252 "movq %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1253
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1254 "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1255 "paddw %%mm7, %%mm5 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1256
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1257 "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1258 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1259
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1260 "paddw %%mm2, %%mm6 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1261 "psubw %%mm2, %%mm7 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1262
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1263 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1264 "paddw %%mm6, %%mm5 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1265 // i0 t13 t12 i3 i1 d0 - d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1266 "psubw %%mm6, %%mm2 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1267 "paddw %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1268
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1269 "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1270 "psllw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1271
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1272 "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1273 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1274
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1275 "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1276 "paddusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1277
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1278 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1279 //
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1280 "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1281 "paddw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1282
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1283 "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1284 "psubusw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1285
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1286 //This func is totally compute-bound, operates at huge speed. So, DC shortcut
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1287 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1288 //However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1289 "paddw "MANGLE(MM_2)", %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1290 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1291
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1292 "paddw %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1293 "psubw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1294
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1295 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1296 "paddw %%mm7, %%mm1 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1297
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1298 "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1299 "psubw %%mm7, %%mm6 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1300
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1301 "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1302 "psraw $2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1303
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1304 "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1305 "psubw %%mm7, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1306 // t7 d2 /t11 t4 t6 - d6 /t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1307
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1308 "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1309 "paddusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1310
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1311 "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1312 "paddw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1313
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1314 "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1315 "psubusw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1316
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1317 //movq [edi+"DCTSIZE_S"*2*2], mm1
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1318 //movq [edi+"DCTSIZE_S"*6*2], mm6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1319 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1320 "psraw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1321
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1322 "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1323 "psubw %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1324
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1325 "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1326 "paddw %%mm7, %%mm6 \n\t" //'t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1327
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1328 "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1329 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1330
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1331 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1332 "paddw %%mm6, %%mm2 \n\t" //'t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1333
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1334 "movq %%mm2, 0*8+%3 \n\t" //!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1335 "psubw %%mm6, %%mm7 \n\t" //'t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1336
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1337 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1338 "psubw %%mm6, %%mm1 \n\t" //'t12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1339
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1340 "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1341 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1342
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1343 "movq %%mm7, 3*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1344 "paddw %%mm2, %%mm3 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1345
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1346 "paddw %%mm4, %%mm2 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1347 "paddw %%mm0, %%mm4 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1348
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1349 "movq %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1350 "psubw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1351
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1352 "psllw $2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1353 "psllw $2, %%mm7 \n\t" //opt for P6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1354
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1355 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1356 "psllw $2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1357
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1358 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1359 "psllw $2, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1360
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1361 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1362 "paddw %%mm1, %%mm5 \n\t" //'t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1363
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1364 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1365 "psubw %%mm1, %%mm6 \n\t" //'t2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1366 // t7 't12 't11 t4 t6 - 't13 't10 ---
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1367
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1368 "paddw %%mm3, %%mm7 \n\t" //z2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1369
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1370 "movq %%mm5, 1*8+%3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1371 "paddw %%mm3, %%mm4 \n\t" //z4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1372
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1373 "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1374 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1375
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1376 "movq %%mm6, 2*8+%3 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1377 "psubw %%mm2, %%mm1 \n\t" //z13
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1378
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1379 //===
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1380 "paddw %%mm2, %%mm0 \n\t" //z11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1381 "movq %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1382
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1383 "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1384 "psubw %%mm7, %%mm1 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1385
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1386 "paddw %%mm7, %%mm5 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1387 "psubw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1388
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1389 "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1390 "psubw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1391
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1392 "movq %%mm0, %%mm6 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1393 "paddw %%mm4, %%mm0 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1394
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1395 "paddusw %%mm3, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1396 "psubw %%mm4, %%mm6 \n\t" //d7
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1397
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1398 // d1 d3 - - - d5 d7 -
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1399 "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1400 "psubw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1401
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1402 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1403 "paddusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1404
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1405 "paddusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1406 "paddw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1407
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1408 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1409 "paddw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1410
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1411 "psubusw %%mm3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1412 "psubusw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1413
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1414 "psubusw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1415 "movq %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1416
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1417 "por %%mm5, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1418 "paddusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1419
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1420 "por %%mm6, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1421 "paddw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1422
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1423 "packssdw %%mm4, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1424 "psubusw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1425
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1426 "movd %%mm4, %%"REG_a" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1427 "or %%"REG_a", %%"REG_a" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1428 "jnz 3f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1429 //movq [edi+"DCTSIZE_S"*3*2], mm1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1430 //movq [edi+"DCTSIZE_S"*5*2], mm5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1431 //movq [edi+"DCTSIZE_S"*1*2], mm0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1432 //movq [edi+"DCTSIZE_S"*7*2], mm6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1433 // t4 t5 - - - t6 t7 -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1434 //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1435 //Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1436 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1437 "movq %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1438
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1439 "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1440 "movq %%mm1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1441
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1442 "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1443 "movq %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1444
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1445 "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1446 "paddw %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1447
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1448 "movq 1*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1449 //paddw mm3, MM_2
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1450 "psraw $2, %%mm3 \n\t" //tmp7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1451
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1452 "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1453 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1454
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1455 "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1456 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1457
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1458 "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1459 "paddw %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1460
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1461 "movq 2*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1462 "psubw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1463
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1464 "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1465 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1466
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1467 "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1468 "paddw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1469
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1470 "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1471 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1472
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1473 "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1474 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1475
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1476 "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1477 "paddw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1478
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1479 "movq 3*8+%3, %%mm0 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1480 "add $24, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1481
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1482 "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1483 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1484
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1485 "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1486 "psubw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1487
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1488 "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1489 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1490
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1491 "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1492 "paddw %%mm0, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1493
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1494 "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1495
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1496 "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1497 "add $24, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1498 "sub $2, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1499 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1500 "jmp 5f \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1501
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1502 "3: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1503 //--- non DC2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1504 //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1)
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1505 //psraw mm5, 2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1506 //psraw mm0, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1507 //psraw mm6, 2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1508 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1509 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1510
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1511 "psllw $1, %%mm5 \n\t" //'z10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1512 "paddw %%mm1, %%mm3 \n\t" //'z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1513
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1514 "movq %%mm0, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1515 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1516
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1517 "movq %%mm5, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1518 "psllw $1, %%mm0 \n\t" //'z12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1519
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1520 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1521 "paddw %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1522
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1523 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1524 "paddw %%mm6, %%mm2 \n\t" //'z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1525
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1526 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1527 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1528
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1529 //---
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1530 "movq 0*8+%3, %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1531 "psubw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1532
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1533 "psllw $1, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1534 "paddw %%mm3, %%mm7 \n\t" //'t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1535
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1536 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1537 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1538 //paddw mm7, MM_2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1539 "psraw $2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1540
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1541 "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1542 "psubw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1543
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1544 "movq 1*8+%3, %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1545 "paddw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1546
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1547 "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1548 "paddw %%mm5, %%mm1 \n\t" //'t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1549
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1550 "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1551 "psubw %%mm7, %%mm1 \n\t" //'t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1552
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1553 "movq 2*8+%3, %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1554 "psubw %%mm5, %%mm0 \n\t" //'t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1555
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1556 "movq 3*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1557 "movq %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1558
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1559 "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1560 "psubw %%mm1, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1561
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1562 "psubw %%mm1, %%mm2 \n\t" //'t5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1563 "paddw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1564
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1565 "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1566 "movq %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1567
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1568 "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1569 "psubw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1570
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1571 "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1572 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1573
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1574 "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1575 "paddw %%mm2, %%mm0 \n\t" //'t4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1576
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1577 // 't4 't6 't5 - - - - 't7
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1578 "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1579 "movq %%mm6, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1580
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1581 "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1582 "psubw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1583
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1584 "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1585 "paddw %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1586
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1587 "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1588 "add $24, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1589
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1590 "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1591
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1592 "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1593 "add $24, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1594 "sub $2, %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1595 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1596 "5: \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1597
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1598 : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
15632
e813a3e431a8 move unchanged registers back to input spec
henry
parents: 15631
diff changeset
1599 : "d"(thr_adr)
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1600 : "%"REG_a
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1601 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1602 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1603
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1604 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1605
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
1606 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1607
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1608 static void row_idct_c(DCTELEM* workspace,
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1609 int16_t* output_adr, int output_stride, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1610 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1611 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1612 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1613 int_simd16_t z5, z10, z11, z12, z13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1614 int16_t* outptr;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1615 DCTELEM* wsptr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1616
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1617 cnt*=4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1618 wsptr = workspace;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1619 outptr = output_adr;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1620 for (; cnt > 0; cnt--) {
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1621 // Even part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1622 //Simd version reads 4x4 block and transposes it
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1623 tmp10 = ( wsptr[2] + wsptr[3]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1624 tmp11 = ( wsptr[2] - wsptr[3]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1625
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1626 tmp13 = ( wsptr[0] + wsptr[1]);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1627 tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)<<2) - tmp13;//this shift order to avoid overflow
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1628
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1629 tmp0 = tmp10 + tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1630 tmp3 = tmp10 - tmp13; //->temps
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1631 tmp1 = tmp11 + tmp12;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1632 tmp2 = tmp11 - tmp12;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1633
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1634 // Odd part
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1635 //Also transpose, with previous:
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1636 // ---- ---- ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1637 // ---- ---- idct ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1638 // ---- ---- ---> ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1639 // ---- ---- ||||
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1640 z13 = wsptr[4] + wsptr[5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1641 z10 = wsptr[4] - wsptr[5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1642 z11 = wsptr[6] + wsptr[7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1643 z12 = wsptr[6] - wsptr[7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1644
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1645 tmp7 = z11 + z13;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1646 tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1647
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1648 z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1649 tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1650 tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1651
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1652 tmp6 = (tmp12<<3) - tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1653 tmp5 = (tmp11<<3) - tmp6;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1654 tmp4 = (tmp10<<3) + tmp5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1655
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1656 // Final output stage: descale and write column
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1657 outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1658 outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1659 outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1660 outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1661 outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1662 outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1663 outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1664 outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1665 outptr++;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1666
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1667 wsptr += DCTSIZE; // advance pointer to next row
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1668 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1669 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1670
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1671 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1672
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1673 static void row_idct_mmx (DCTELEM* workspace,
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1674 int16_t* output_adr, int output_stride, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1675 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1676 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
1677 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1678 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1679 "1: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1680 "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1681 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1682
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1683 "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1684 "movq %%mm0, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1685
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1686 "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1687 "punpcklwd %%mm1, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1688
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1689 "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1690 "punpckhwd %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1691
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1692 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1693 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1694 "punpcklwd %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1695
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1696 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1697 "punpckldq %%mm2, %%mm0 \n\t" //0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1698
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1699 "punpckhdq %%mm2, %%mm6 \n\t" //1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1700 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1701
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1702 "punpckhwd %%mm3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1703 "psubw %%mm6, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1704
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1705 "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1706 "movq %%mm4, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1707
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1708 "punpckldq %%mm7, %%mm4 \n\t" //2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1709 "paddw %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1710
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1711 "punpckhdq %%mm7, %%mm2 \n\t" //3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1712 "movq %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1713
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1714 "psllw $2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1715 "paddw %%mm2, %%mm4 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1716
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1717 "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1718 "psubw %%mm2, %%mm1 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1719
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1720 "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1721 "psubw %%mm5, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1722
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1723 "movq %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1724 "paddw %%mm5, %%mm4 \n\t" //t0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1725
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1726 "psubw %%mm5, %%mm6 \n\t" //t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1727 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1728
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1729 "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1730 "paddw %%mm0, %%mm1 \n\t" //t1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1731
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1732 "movq %%mm4, 0*8+%3 \n\t" //t0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1733 "movq %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1734
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1735 "movq %%mm6, 1*8+%3 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1736 "punpcklwd %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1737
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1738 //transpose 4x4
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1739 "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1740 "punpckhwd %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1741
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1742 "movq %%mm5, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1743 "punpcklwd %%mm6, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1744
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1745 "psubw %%mm0, %%mm7 \n\t" //t2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1746 "punpckhwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1747
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1748 "movq %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1749 "punpckldq %%mm5, %%mm3 \n\t" //4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1750
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1751 "punpckhdq %%mm5, %%mm0 \n\t" //5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1752 "movq %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1753
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1754 //
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1755 "movq %%mm3, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1756 "punpckldq %%mm2, %%mm4 \n\t" //6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1757
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1758 "psubw %%mm0, %%mm3 \n\t" //z10
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1759 "punpckhdq %%mm2, %%mm5 \n\t" //7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1760
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1761 "paddw %%mm0, %%mm6 \n\t" //z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1762 "movq %%mm4, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1763
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1764 "movq %%mm3, %%mm0 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1765 "psubw %%mm5, %%mm4 \n\t" //z12
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1766
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1767 "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1768 "paddw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1769
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1770 "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1771 "paddw %%mm5, %%mm2 \n\t" //z11 >
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1772
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1773 "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1774 "movq %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1775
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1776 "psubw %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1777 "paddw %%mm6, %%mm5 \n\t" //t7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1778
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1779 "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1780 "paddw %%mm3, %%mm0 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1781
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1782 "psllw $3, %%mm0 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1783 "psubw %%mm3, %%mm4 \n\t" //t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1784
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1785 "movq 0*8+%3, %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1786 "movq %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1787
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1788 "psllw $3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1789 "psubw %%mm5, %%mm0 \n\t" //t6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1790
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1791 "psllw $3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1792 "paddw %%mm0, %%mm1 \n\t" //d1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1793
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1794 "psubw %%mm0, %%mm2 \n\t" //t5
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1795 "psubw %%mm0, %%mm3 \n\t" //d6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1796
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1797 "paddw %%mm2, %%mm4 \n\t" //t4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1798 "movq %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1799
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1800 "paddw %%mm2, %%mm7 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1801 "psubw %%mm2, %%mm0 \n\t" //d5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1802
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1803 "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1804 "psubw %%mm5, %%mm6 \n\t" //d7
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1805
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1806 "paddw 0*8+%3, %%mm5 \n\t" //d0
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1807 "paddw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1808
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1809 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1810 "psraw $3, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1811
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1812 "paddw %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1813 "psraw $3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1814
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1815 "paddw (%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1816 "psraw $3, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1817
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1818 "paddw (%%"REG_D",%%"REG_a",), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1819 "paddw %%mm2, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1820
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1821 "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1822 "paddw %%mm2, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1823
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1824 "movq %%mm5, (%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1825 "paddw %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1826
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1827 "movq %%mm1, (%%"REG_D",%%"REG_a",) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1828 "psraw $3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1829
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1830 "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1831 "add %%"REG_d", %%"REG_D" \n\t" //3*ls
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1832
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1833 "movq 1*8+%3, %%mm5 \n\t" //t3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1834 "psraw $3, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1835
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1836 "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1837 "psubw %%mm4, %%mm5 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1838
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1839 "paddw (%%"REG_D",%%"REG_d",), %%mm3 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1840 "psraw $3, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1841
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1842 "paddw 1*8+%3, %%mm4 \n\t" //d4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1843 "paddw %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1844
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1845 "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1846 "paddw %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1847
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1848 "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1849 "psraw $3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1850
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1851 "paddw (%%"REG_D"), %%mm5 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1852 "psraw $3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1853
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1854 "paddw (%%"REG_D",%%"REG_a",), %%mm4 \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1855 "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1856
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1857 "movq %%mm3, (%%"REG_D",%%"REG_d",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1858 "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1859 "movq %%mm5, (%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1860 "movq %%mm4, (%%"REG_D",%%"REG_a",) \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1861
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1862 "sub %%"REG_d", %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1863 "add $8, %%"REG_D" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1864 "dec %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1865 "jnz 1b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1866
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1867 : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1868 : "a"(output_stride*sizeof(short))
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1869 : "%"REG_d
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1870 );
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1871 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1872
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1873 #endif // HAVE_MMX
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1874
28290
25337a2147e7 Lots and lots of #ifdef ARCH_... -> #if ARCH_...
reimar
parents: 27754
diff changeset
1875 #if !HAVE_MMX
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1876
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1877 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1878 {
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1879 int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1880 int_simd16_t tmp10, tmp11, tmp12, tmp13;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1881 int_simd16_t z1, z2, z3, z4, z5, z11, z13;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1882 DCTELEM *dataptr;
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1883
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1884 cnt*=4;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1885 // Pass 1: process rows.
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1886
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1887 dataptr = data;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1888 for (; cnt > 0; cnt--) {
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1889 tmp0 = pixels[line_size*0] + pixels[line_size*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1890 tmp7 = pixels[line_size*0] - pixels[line_size*7];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1891 tmp1 = pixels[line_size*1] + pixels[line_size*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1892 tmp6 = pixels[line_size*1] - pixels[line_size*6];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1893 tmp2 = pixels[line_size*2] + pixels[line_size*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1894 tmp5 = pixels[line_size*2] - pixels[line_size*5];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1895 tmp3 = pixels[line_size*3] + pixels[line_size*4];
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1896 tmp4 = pixels[line_size*3] - pixels[line_size*4];
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1897
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1898 // Even part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1899
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1900 tmp10 = tmp0 + tmp3;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1901 tmp13 = tmp0 - tmp3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1902 tmp11 = tmp1 + tmp2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1903 tmp12 = tmp1 - tmp2;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1904 //Even columns are written first, this leads to different order of columns
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1905 //in column_fidct(), but they are processed independently, so all ok.
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1906 //Later in the row_idct() columns readed at the same order.
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1907 dataptr[2] = tmp10 + tmp11;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1908 dataptr[3] = tmp10 - tmp11;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1909
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1910 z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1911 dataptr[0] = tmp13 + z1;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1912 dataptr[1] = tmp13 - z1;
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1913
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1914 // Odd part
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1915
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1916 tmp10 = (tmp4 + tmp5) <<2;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1917 tmp11 = (tmp5 + tmp6) <<2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1918 tmp12 = (tmp6 + tmp7) <<2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1919
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1920 z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1921 z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1922 z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1923 z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1924
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1925 z11 = tmp7 + z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1926 z13 = tmp7 - z3;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1927
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1928 dataptr[4] = z13 + z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1929 dataptr[5] = z13 - z2;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1930 dataptr[6] = z11 + z4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1931 dataptr[7] = z11 - z4;
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1932
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1933 pixels++; // advance pointer to next column
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1934 dataptr += DCTSIZE;
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1935 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1936 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1937
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1938 #else /* HAVE_MMX */
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1939
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1940 static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1941 {
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1942 uint64_t __attribute__((aligned(8))) temps[4];
27754
08d18fe9da52 Change all occurrences of asm and __asm to __asm__, same as was done for FFmpeg.
diego
parents: 26727
diff changeset
1943 __asm__ volatile(
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1944 "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1945 "6: \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1946 "movd (%%"REG_S"), %%mm0 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1947 "pxor %%mm7, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1948
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1949 "movd (%%"REG_S",%%"REG_a",), %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1950 "punpcklbw %%mm7, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1951
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1952 "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1953 "punpcklbw %%mm7, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1954
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1955 "punpcklbw %%mm7, %%mm2 \n\t"
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1956 "add %%"REG_d", %%"REG_S" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1957
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1958 "movq %%mm0, %%mm5 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
1959 //
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1960
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1961 "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch!
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1962 "movq %%mm1, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1963
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1964 "movd (%%"REG_S",%%"REG_d",), %%mm4 \n\t" //6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1965 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1966
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1967 "psubw %%mm3, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1968 "punpcklbw %%mm7, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1969
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1970 "paddw %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1971 "psubw %%mm4, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1972
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1973 "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1974 "paddw %%mm4, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1975
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1976 "movq %%mm5, 0*8+%3 \n\t" //t7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1977 "punpcklbw %%mm7, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1978
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
1979 "movq %%mm6, 1*8+%3 \n\t" //t6
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1980 "movq %%mm2, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1981
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1982 "movd (%%"REG_S"), %%mm5 \n\t" //3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1983 "paddw %%mm3, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1984
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
1985 "movd (%%"REG_S",%%"REG_a",), %%mm6 \n\t" //4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1986 "punpcklbw %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1987
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1988 "psubw %%mm3, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1989 "punpcklbw %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1990
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1991 "movq %%mm5, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1992 "paddw %%mm6, %%mm5 \n\t" //t3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1993
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1994 "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - -
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1995 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1996
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1997 "movq %%mm1, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1998 "psubw %%mm5, %%mm0 \n\t" //t13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
1999
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2000 "psubw %%mm2, %%mm1 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2001 "paddw %%mm2, %%mm7 \n\t" //t11
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2002
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2003 "paddw %%mm0, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2004 "movq %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2005
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2006 "psllw $2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2007 "paddw %%mm5, %%mm6 \n\t" //t10
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2008
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2009 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2010 "paddw %%mm6, %%mm7 \n\t" //d2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2011
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2012 "psubw %%mm2, %%mm6 \n\t" //d3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2013 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2014
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2015 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2016 "movq %%mm7, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2017 "punpcklwd %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2018
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2019 "paddw %%mm1, %%mm0 \n\t" //d0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2020 "punpckhwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2021
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2022 "psubw %%mm1, %%mm5 \n\t" //d1
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2023 "movq %%mm0, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2024
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2025 "movq 1*8+%3, %%mm1 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2026 "punpcklwd %%mm5, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2027
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2028 "punpckhwd %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2029 "movq %%mm0, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2030
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2031 "punpckldq %%mm7, %%mm0 \n\t" //0
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2032 "paddw %%mm4, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2033
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2034 "punpckhdq %%mm7, %%mm5 \n\t" //1
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2035 "movq %%mm6, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2036
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2037 "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2038 "punpckldq %%mm2, %%mm6 \n\t" //2
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2039
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2040 "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2041 "punpckhdq %%mm2, %%mm7 \n\t" //3
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2042
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2043 "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2044 "paddw %%mm1, %%mm4 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2045
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2046 "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2047 "psllw $2, %%mm3 \n\t" //t10
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2048
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2049 "movq 0*8+%3, %%mm2 \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2050 "psllw $2, %%mm4 \n\t" //t11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2051
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2052 "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" //z3
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2053 "paddw %%mm2, %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2054
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2055 "psllw $2, %%mm1 \n\t" //t12
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2056 "movq %%mm3, %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2057
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2058 "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2059 "psubw %%mm1, %%mm3 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2060
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2061 "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2062 "movq %%mm2, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2063
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2064 "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2065 "psubw %%mm4, %%mm2 \n\t" //z13
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2066
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2067 "paddw %%mm4, %%mm5 \n\t" //z11
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2068 "movq %%mm2, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2069
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2070 "paddw %%mm3, %%mm0 \n\t" //z2
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2071 "movq %%mm5, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2072
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2073 "paddw %%mm0, %%mm2 \n\t" //d4
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2074 "psubw %%mm0, %%mm6 \n\t" //d5
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2075
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2076 "movq %%mm2, %%mm4 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2077 "paddw %%mm3, %%mm1 \n\t" //z4
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2078
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2079 //transpose 4x4
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2080 "punpcklwd %%mm6, %%mm2 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2081 "paddw %%mm1, %%mm5 \n\t" //d6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2082
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2083 "punpckhwd %%mm6, %%mm4 \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2084 "psubw %%mm1, %%mm7 \n\t" //d7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2085
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2086 "movq %%mm5, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2087 "punpcklwd %%mm7, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2088
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2089 "punpckhwd %%mm7, %%mm6 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2090 "movq %%mm2, %%mm7 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2091
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2092 "punpckldq %%mm5, %%mm2 \n\t" //4
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2093 "sub %%"REG_d", %%"REG_S" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2094
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2095 "punpckhdq %%mm5, %%mm7 \n\t" //5
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2096 "movq %%mm4, %%mm5 \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2097
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2098 "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2099 "punpckldq %%mm6, %%mm4 \n\t" //6
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2100
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2101 "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2102 "punpckhdq %%mm6, %%mm5 \n\t" //7
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2103
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2104 "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2105 "add $4, %%"REG_S" \n\t"
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2106
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2107 "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
29263
0f1b5b68af32 whitespace cosmetics: Remove all trailing whitespace.
diego
parents: 29087
diff changeset
2108 "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2109 "dec %%"REG_c" \n\t"
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2110 "jnz 6b \n\t"
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2111
26052
ce480034f391 Do not use a global temps variable, this is ugly and does not compile with ICC.
reimar
parents: 26050
diff changeset
2112 : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
15632
e813a3e431a8 move unchanged registers back to input spec
henry
parents: 15631
diff changeset
2113 : "a"(line_size)
15634
7eddcf69a5fd x86-64 fixes by Reimar
henry
parents: 15633
diff changeset
2114 : "%"REG_d);
15631
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2115 }
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2116
d5a95e6f5f07 faster spp filter by Nikolaj Poroshin <porosh3 at psu ru>
henry
parents:
diff changeset
2117 #endif // HAVE_MMX