annotate postproc/rgb2rgb_template.c @ 2746:dece635a28e3

Minor speedup of rgb32to24. (performance is not successful)
author nick
date Tue, 06 Nov 2001 17:14:22 +0000
parents b8a692c59b64
children 2f93f4351765
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
1 /*
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
2 *
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
3 * rgb2rgb.c, Software RGB to RGB convertor
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
4 * Software PAL8 to RGB convertor
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
5 * Software YUV to YUV convertor
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
6 * Software YUV to RGB convertor
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
7 * Written by Nick Kurshev.
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
8 * palette stuff & yuv stuff by Michael
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
9 */
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
10 #include <inttypes.h>
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
11 #include "../config.h"
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
12 #include "rgb2rgb.h"
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
13 #include "../mmx_defs.h"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
14
2535
b44113f46c96 cant compile on non x86 bugfix
michael
parents: 2517
diff changeset
#ifdef HAVE_MMX
/*
 * 64-bit constants used as memory operands by the MMX inline assembly in
 * this file.  Each one is 8-byte aligned so that it can be fetched with a
 * single aligned quadword access.
 */

/* Keeps the low 24 bits of each 32-bit lane (loaded into mm7 by rgb24to32). */
static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;

/* Bit-field selectors used by rgb32to24 while packing 3-byte pixels. */
static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; /* bits  0..23 */
static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; /* bits 24..47 */
static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL; /* top 2 bytes */
static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL; /* top 4 bytes */
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL; /* top 6 bytes */

/* Per-16-bit-word masks for 15-bit pixels (four pixels per quadword). */
static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
/* Everything above the blue field, including the top bit; used by rgb15to16. */
static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;

/*
 * Per-32-bit-lane colour-field masks (one pixel per lane, two per quadword).
 * 16-bit layout: rrrrrggg gggbbbbb.
 */
static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;

/*
 * 15-bit layout: 0rrrrrgg gggbbbbb.  Both lanes must hold the same mask;
 * the low lanes of the red and green masks previously carried the 16-bit
 * values (0xf800 / 0x07e0), which selected the wrong bits for every other
 * pixel.
 */
static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000003e0ULL;
static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
#endif
2513
nick
parents: 2512
diff changeset
32
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Each group of three source bytes is copied unchanged and followed by one
 * zero byte in the destination.
 *
 * @param src      packed 3-byte pixels
 * @param dst      output buffer; 4 bytes are written per complete source pixel
 * @param src_size size of src in bytes; a trailing partial pixel
 *                 (src_size % 3 bytes) is ignored instead of being read
 *                 past the end of the buffer
 */
void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
	/*
	 * One pass converts 8 pixels: it consumes 24 source bytes and stores 32
	 * destination bytes.  The last load starts at offset 21 and is 32 bits
	 * wide, so 25 readable bytes are required.  The bound is expressed as
	 * the number of bytes remaining rather than as a rounded absolute
	 * address, which says nothing about how far s is from end.
	 */
	while(end - s >= 25)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movd %1, %%mm0\n\t"
			"punpckldq 3%1, %%mm0\n\t"
			"movd 6%1, %%mm1\n\t"
			"punpckldq 9%1, %%mm1\n\t"
			"movd 12%1, %%mm2\n\t"
			"punpckldq 15%1, %%mm2\n\t"
			"movd 18%1, %%mm3\n\t"
			"punpckldq 21%1, %%mm3\n\t"
			"pand %%mm7, %%mm0\n\t"
			"pand %%mm7, %%mm1\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm3\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm2, 16%0\n\t"
			MOVNTQ" %%mm3, 24%0"
			:"=m"(*dest)
			:"m"(*s)
			:"memory");
		dest += 32;
		s += 24;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Plain C path: the whole buffer without MMX, the remainder with it. */
	while(end - s >= 3)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest[3] = 0;
		dest += 4;
		s += 3;
	}
}
2505
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
84
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every source pixel are copied; the fourth byte
 * is discarded.
 *
 * @param src      4-byte pixels
 * @param dst      output buffer; 3 bytes are written per complete source pixel
 * @param src_size size of src in bytes; only complete 4-byte pixels are
 *                 converted, trailing bytes (src_size % 4) are ignored
 *                 instead of being read past the end of the buffer
 */
void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	uint8_t *dest = dst;
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
	/*
	 * One pass converts 8 pixels: four quadword loads (32 source bytes) and
	 * three quadword stores (24 destination bytes).  The bound counts the
	 * bytes still available; rounding the absolute address of end does not
	 * tell how many bytes remain after s.
	 */
	while(end - s >= 32)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm1\n\t"
			"movq 16%1, %%mm4\n\t"
			"movq 24%1, %%mm5\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm1, %%mm3\n\t"
			"movq %%mm4, %%mm6\n\t"
			"movq %%mm5, %%mm7\n\t"
			/* drop the 4th byte of each pixel: every quadword
			   shrinks from 2x4 bytes to 6 packed bytes */
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm3\n\t"
			"psrlq $8, %%mm6\n\t"
			"psrlq $8, %%mm7\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm1\n\t"
			"pand %2, %%mm4\n\t"
			"pand %2, %%mm5\n\t"
			"pand %3, %%mm2\n\t"
			"pand %3, %%mm3\n\t"
			"pand %3, %%mm6\n\t"
			"pand %3, %%mm7\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm3, %%mm1\n\t"
			"por %%mm6, %%mm4\n\t"
			"por %%mm7, %%mm5\n\t"

			/* splice the four 6-byte groups into three full quadwords */
			"movq %%mm1, %%mm2\n\t"
			"movq %%mm4, %%mm3\n\t"
			"psllq $48, %%mm2\n\t"
			"psllq $32, %%mm3\n\t"
			"pand %4, %%mm2\n\t"
			"pand %5, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"psrlq $16, %%mm1\n\t"
			"psrlq $32, %%mm4\n\t"
			"psllq $16, %%mm5\n\t"
			"por %%mm3, %%mm1\n\t"
			"pand %6, %%mm5\n\t"
			"por %%mm5, %%mm4\n\t"

			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm1, 8%0\n\t"
			MOVNTQ" %%mm4, 16%0"
			:"=m"(*dest)
			:"m"(*s),"m"(mask24l),
			 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
			:"memory");
		dest += 24;
		s += 32;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Plain C path: the whole buffer without MMX, the remainder with it. */
	while(end - s >= 4)
	{
		dest[0] = s[0];
		dest[1] = s[1];
		dest[2] = s[2];
		dest += 3;
		s += 4;	/* skip the unused 4th byte */
	}
}
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
161
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
162 /*
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
163 Original by Strepto/Astral
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
164 ported to gcc & bugfixed : A'rpi
2564
3d04a0991dce cosmetic
nick
parents: 2538
diff changeset
165 MMX2, 3DNOW optimization by Nick Kurshev
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
166 32bit c version, and and&add trick by Michael Niedermayer
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
167 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
 * (rrrrrggggggbbbbb).
 *
 * The red/green field is added to the pixel once more, which moves it up by
 * one bit while the blue field stays in place.  The top bit of the source
 * pixel never reaches the output.
 *
 * @param src      15-bit pixels stored as native-endian 16-bit words
 * @param dst      output buffer, same size as the input
 * @param src_size size of src in bytes; a trailing odd byte is ignored
 *
 * NOTE(review): like the previous C implementation this accesses the buffers
 * through 32-bit pointers, so src and dst are assumed to be suitably aligned
 * - confirm against callers.
 */
void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
	const uint8_t *s = src;
	uint8_t *d = dst;
	const uint8_t *end = src + src_size;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*s));
	__asm __volatile(
		"movq %0, %%mm4\n\t"
		::"m"(mask15s));
	/*
	 * 8 pixels (16 bytes) per pass, only while a full group is available;
	 * whatever is left is finished by the C code below.  The "memory"
	 * clobber is needed because the operands are typed as single bytes
	 * although 16 bytes are read and written.
	 */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			"movq %1, %%mm0\n\t"
			"movq 8%1, %%mm2\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm2, %%mm3\n\t"
			"pand %%mm4, %%mm0\n\t"
			"pand %%mm4, %%mm2\n\t"
			/* paddw wraps per 16-bit word, so the doubled top bit
			   is discarded */
			"paddw %%mm1, %%mm0\n\t"
			"paddw %%mm3, %%mm2\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			MOVNTQ" %%mm2, 8%0"
			:"=m"(*d)
			:"m"(*s)
			:"memory");
		s += 16;
		d += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/*
	 * Two pixels per 32-bit word.  Unsigned arithmetic is required: the sum
	 * can reach 0xFFDFFFDF, which does not fit a signed 32-bit int.  Each
	 * 16-bit half is at most 0x7FFF + 0x7FE0 = 0xFFDF, so nothing carries
	 * from the low pixel into the high one.
	 */
	while(end - s >= 4)
	{
		const uint32_t x = *(const uint32_t *)s;
		*(uint32_t *)d = (x & 0x7FFF7FFFU) + (x & 0x7FE07FE0U);
		s += 4;
		d += 4;
	}
	/* Odd pixel count: convert the last 16-bit pixel on its own. */
	if(end - s >= 2)
	{
		const unsigned x = *(const uint16_t *)s;
		*(uint16_t *)d = (uint16_t)((x & 0x7FFFU) + (x & 0x7FE0U));
	}
}
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
227
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
/**
 * Expand 8-bit palettized pixels to 32-bit pixels.
 *
 * Palette is assumed to contain bgr32: every entry is copied to the output
 * as one 32-bit word, indexed by the source byte.
 *
 * @param src        one palette index per pixel
 * @param dst        output buffer, 4 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    table of 4-byte entries; it is only read
 *
 * NOTE(review): dst and palette are accessed as 32-bit words, so they are
 * assumed to be suitably aligned - confirm against callers.
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	/* Fixed-width, const-preserving views of the two word-addressed buffers. */
	const uint32_t *entries = (const uint32_t *)palette;
	uint32_t *out = (uint32_t *)dst;
	unsigned i;

	for(i=0; i<num_pixels; i++)
		out[i] = entries[ src[i] ];
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
237
2697
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
/**
 * Expand 8-bit palettized pixels to 3-byte pixels.
 *
 * Palette is assumed to contain bgr32: the first three bytes of the 4-byte
 * entry selected by each source byte are copied, the fourth is skipped.
 *
 * @param src        one palette index per pixel
 * @param dst        output buffer, 3 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    table of 4-byte entries
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;

	/*
	 * Bytes are copied one at a time on purpose: storing a whole 32-bit
	 * palette entry per pixel would write one byte too much and might cause
	 * alignment issues on some architectures.
	 */
	for(i=0; i<num_pixels; i++)
	{
		const uint8_t *entry = palette + src[i]*4;

		dst[0]= entry[0];
		dst[1]= entry[1];
		dst[2]= entry[2];
		dst+= 3;
	}
}
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
258
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts 32-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Every source pixel is 4 bytes, read as B, G, R plus one byte that is
 * ignored. The result packs the top 5 bits of B into bits 0-4, the top
 * 6 bits of G into bits 5-10 and the top 5 bits of R into bits 11-15.
 *
 * src      - source pixel bytes
 * dst      - destination, written as one uint16_t per source pixel
 * src_size - source size in bytes; trailing bytes that do not form a whole
 *            4-byte pixel are ignored
 */
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
	unsigned left;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7 = red mask, mm6 = green mask; both stay live across the loop */
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * Each pass loads 16 source bytes (offsets 0, 4, 8, 12) and stores
	 * 4 output pixels, so it may only run while 16 bytes remain. The
	 * bound is taken from the remaining size, not from the alignment of
	 * the end address, so an unaligned src cannot step past the end.
	 */
	while(end - s >= 16)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $5, %%mm1\n\t"
		"psrlq $5, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
	    d += 4;
	    s += 16;
	}
#endif
	/*
	 * Scalar conversion: the whole input without MMX, the leftover pixels
	 * with it. The pointer advances by 4 so the ignored fourth byte of
	 * each pixel is skipped.
	 */
	for(left = (unsigned)(end - s)/4; left; left--)
	{
	    const int b= s[0];
	    const int g= s[1];
	    const int r= s[2];

	    *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	    s += 4;
	}
#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
329
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts 32-bit pixels to 15-bit (5-5-5) pixels.
 *
 * Every source pixel is 4 bytes, read as B, G, R plus one byte that is
 * ignored. The result packs the top 5 bits of B into bits 0-4, the top
 * 5 bits of G into bits 5-9 and the top 5 bits of R into bits 10-14.
 *
 * src      - source pixel bytes
 * dst      - destination, written as one uint16_t per source pixel
 * src_size - source size in bytes; trailing bytes that do not form a whole
 *            4-byte pixel are ignored
 */
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
	unsigned left;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7 = red mask, mm6 = green mask; both stay live across the loop */
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_15mask),"m"(green_15mask));
	/*
	 * Each pass loads 16 source bytes (offsets 0, 4, 8, 12) and stores
	 * 4 output pixels, so it may only run while 16 bytes remain. The
	 * bound is taken from the remaining size, not from the alignment of
	 * the end address, so an unaligned src cannot step past the end.
	 */
	while(end - s >= 16)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 4%1, %%mm3\n\t"
		"punpckldq 8%1, %%mm0\n\t"
		"punpckldq 12%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $6, %%mm1\n\t"
		"psrlq $6, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $9, %%mm2\n\t"
		"psrlq $9, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
	    d += 4;
	    s += 16;
	}
#endif
	/*
	 * Scalar conversion: the whole input without MMX, the leftover pixels
	 * with it. The pointer advances by 4 so the ignored fourth byte of
	 * each pixel is skipped.
	 */
	for(left = (unsigned)(end - s)/4; left; left--)
	{
	    const int b= s[0];
	    const int g= s[1];
	    const int r= s[2];

	    *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
	    s += 4;
	}
#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
400
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts 24-bit pixels to 16-bit (5-6-5) pixels.
 *
 * Every source pixel is 3 bytes, read as B, G, R. The result packs the top
 * 5 bits of B into bits 0-4, the top 6 bits of G into bits 5-10 and the top
 * 5 bits of R into bits 11-15.
 *
 * src      - source pixel bytes
 * dst      - destination, written as one uint16_t per source pixel
 * src_size - source size in bytes; trailing bytes that do not form a whole
 *            3-byte pixel are ignored
 */
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
	unsigned left;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7 = red mask, mm6 = green mask; both stay live across the loop */
	__asm __volatile(
	    "movq %0, %%mm7\n\t"
	    "movq %1, %%mm6\n\t"
	    ::"m"(red_16mask),"m"(green_16mask));
	/*
	 * Each pass consumes 12 source bytes (4 pixels) but its last load
	 * starts at offset 9 and therefore reaches beyond those 12 bytes.
	 * Requiring 16 remaining bytes keeps every load inside the source
	 * and every store inside the matching output; the bound is taken
	 * from the remaining size, not from the alignment of the end address.
	 */
	while(end - s >= 16)
	{
	    __asm __volatile(
		PREFETCH" 32%1\n\t"
		"movd %1, %%mm0\n\t"
		"movd 3%1, %%mm3\n\t"
		"punpckldq 6%1, %%mm0\n\t"
		"punpckldq 9%1, %%mm3\n\t"
		"movq %%mm0, %%mm1\n\t"
		"movq %%mm0, %%mm2\n\t"
		"movq %%mm3, %%mm4\n\t"
		"movq %%mm3, %%mm5\n\t"
		"psrlq $3, %%mm0\n\t"
		"psrlq $3, %%mm3\n\t"
		"pand %2, %%mm0\n\t"
		"pand %2, %%mm3\n\t"
		"psrlq $5, %%mm1\n\t"
		"psrlq $5, %%mm4\n\t"
		"pand %%mm6, %%mm1\n\t"
		"pand %%mm6, %%mm4\n\t"
		"psrlq $8, %%mm2\n\t"
		"psrlq $8, %%mm5\n\t"
		"pand %%mm7, %%mm2\n\t"
		"pand %%mm7, %%mm5\n\t"
		"por %%mm1, %%mm0\n\t"
		"por %%mm4, %%mm3\n\t"
		"por %%mm2, %%mm0\n\t"
		"por %%mm5, %%mm3\n\t"
		"psllq $16, %%mm3\n\t"
		"por %%mm3, %%mm0\n\t"
		MOVNTQ" %%mm0, %0\n\t"
		:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
	    d += 4;
	    s += 12;
	}
#endif
	/*
	 * Scalar conversion: the whole input without MMX, the leftover pixels
	 * with it. Only whole 3-byte pixels are read.
	 */
	for(left = (unsigned)(end - s)/3; left; left--)
	{
	    const int b= s[0];
	    const int g= s[1];
	    const int r= s[2];

	    *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
	    s += 3;
	}
#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
}
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
472
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
473 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
474 {
2741
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
475 #ifdef HAVE_MMX
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
476 const uint8_t *s = src;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
477 const uint8_t *end,*mm_end;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
478 uint16_t *d = (uint16_t *)dst;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
479 end = s + src_size;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
480 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
481 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
482 __asm __volatile(
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
483 "movq %0, %%mm7\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
484 "movq %1, %%mm6\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
485 ::"m"(red_15mask),"m"(green_15mask));
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
486 if(mm_end == end) mm_end -= MMREG_SIZE*2;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
487 while(s < mm_end)
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
488 {
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
489 __asm __volatile(
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
490 PREFETCH" 32%1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
491 "movd %1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
492 "movd 3%1, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
493 "punpckldq 6%1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
494 "punpckldq 9%1, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
495 "movq %%mm0, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
496 "movq %%mm0, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
497 "movq %%mm3, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
498 "movq %%mm3, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
499 "psrlq $3, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
500 "psrlq $3, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
501 "pand %2, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
502 "pand %2, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
503 "psrlq $6, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
504 "psrlq $6, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
505 "pand %%mm6, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
506 "pand %%mm6, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
507 "psrlq $9, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
508 "psrlq $9, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
509 "pand %%mm7, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
510 "pand %%mm7, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
511 "por %%mm1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
512 "por %%mm4, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
513 "por %%mm2, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
514 "por %%mm5, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
515 "psllq $16, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
516 "por %%mm3, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
517 MOVNTQ" %%mm0, %0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
518 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
519 d += 4;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
520 s += 12;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
521 }
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
522 while(s < end)
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
523 {
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
524 const int b= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
525 const int g= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
526 const int r= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
527 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
528 }
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
529 __asm __volatile(SFENCE:::"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
530 __asm __volatile(EMMS:::"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
531 #else
2719
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
532 unsigned j,i,num_pixels=src_size/3;
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
533 uint16_t *d = (uint16_t *)dst;
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
534 for(i=0,j=0; j<num_pixels; i+=3,j++)
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
535 {
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
536 const int b= src[i+0];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
537 const int g= src[i+1];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
538 const int r= src[i+2];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
539
2720
4ba64e254042 Fixed rgb32(24)to15 stuff
nick
parents: 2719
diff changeset
540 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
541 }
2741
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
542 #endif
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
543 }
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
544
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
545 /**
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
546 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
547 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
548 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
549 {
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
550 unsigned i;
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
551 for(i=0; i<num_pixels; i++)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
552 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
553 }
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
554
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
555 /**
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
556 * Palette is assumed to contain bgr15, see rgb32to15 to convert the palette
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
557 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
558 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
559 {
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
560 unsigned i;
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
561 for(i=0; i<num_pixels; i++)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
562 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
2697
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
563 }
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
564 /**
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
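565 * NOTE(review): the MMX asm below stores through dst but lists only %eax as clobbered (no "memory" clobber, unlike yuy2toyv12) - confirm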
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
566 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
567 * problem for anyone then tell me, and I'll fix it)
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
568 */
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
569 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
570 unsigned int width, unsigned int height,
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
571 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
572 {
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
573 int y;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
574 const int chromWidth= width>>1;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
575 for(y=0; y<height; y++)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
576 {
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
577 #ifdef HAVE_MMX
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
578 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
579 asm volatile(
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
580 "xorl %%eax, %%eax \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
581 "1: \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
582 PREFETCH" 32(%1, %%eax, 2) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
583 PREFETCH" 32(%2, %%eax) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
584 PREFETCH" 32(%3, %%eax) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
585 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
586 "movq %%mm0, %%mm2 \n\t" // U(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
587 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
588 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
589 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
590
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
591 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
592 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
593 "movq %%mm3, %%mm4 \n\t" // Y(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
594 "movq %%mm5, %%mm6 \n\t" // Y(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
595 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
596 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
597 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
598 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
599
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
600 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
601 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
602 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
603 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
604
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
605 "addl $8, %%eax \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
606 "cmpl %4, %%eax \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
607 " jb 1b \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
608 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
609 : "%eax"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
610 );
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
611 #else
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
612 int i;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
613 for(i=0; i<chromWidth; i++)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
614 {
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
615 dst[4*i+0] = ysrc[2*i+0];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
616 dst[4*i+1] = usrc[i];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
617 dst[4*i+2] = ysrc[2*i+1];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
618 dst[4*i+3] = vsrc[i];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
619 }
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
620 #endif
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
621 if(y&1)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
622 {
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
623 usrc += chromStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
624 vsrc += chromStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
625 }
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
626 ysrc += lumStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
627 dst += dstStride;
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
628 }
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
629 #ifdef HAVE_MMX
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
630 asm( EMMS" \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
631 SFENCE" \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
632 :::"memory");
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
633 #endif
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
634 }
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
635
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
636 /**
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
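637 * NOTE(review): with HAVE_MMX each loop iteration converts two rows but advances src/ydst by one stride only (the C fallback advances them twice) - verify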
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
638 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
639 * problem for anyone then tell me, and I'll fix it)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
640 */
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
641 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
642 unsigned int width, unsigned int height,
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
643 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
644 {
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
645 int y;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
646 const int chromWidth= width>>1;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
647 for(y=0; y<height; y+=2)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
648 {
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
649 #ifdef HAVE_MMX
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
650 asm volatile(
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
651 "xorl %%eax, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
652 "pcmpeqw %%mm7, %%mm7 \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
653 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
654 "1: \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
655 PREFETCH" 64(%0, %%eax, 4) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
656 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
657 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
658 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
659 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
660 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
661 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
662 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
663 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
664 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
665 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
666
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
667 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
668
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
669 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
670 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
671 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
672 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
673 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
674 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
675 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
676 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
677 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
678 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
679
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
680 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
681
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
682 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
683 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
684 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
685 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
686 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
687 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
688 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
689 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
690
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
691 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
692 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
693
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
694 "addl $8, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
695 "cmpl %4, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
696 " jb 1b \n\t"
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
697 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
698 : "memory", "%eax"
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
699 );
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
700
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
701 asm volatile(
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
702 "xorl %%eax, %%eax \n\t"
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
703 "1: \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
704 PREFETCH" 64(%0, %%eax, 4) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
705 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
706 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
707 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
708 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
709 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
710 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
711 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
712 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
713 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
714 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
715
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
716 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
717 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
718
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
719 "addl $8, %%eax \n\t"
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
720 "cmpl %4, %%eax \n\t"
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
721 " jb 1b \n\t"
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
722
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
723 ::"r"(src+srcStride), "r"(ydst+lumStride), "r"(udst), "r"(vdst), "r" (chromWidth)
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
724 : "memory", "%eax"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
725 );
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
726 #else
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
727 int i;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
728 for(i=0; i<chromWidth; i++)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
729 {
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
730 ydst[2*i+0] = src[4*i+0];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
731 udst[i] = src[4*i+1];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
732 ydst[2*i+1] = src[4*i+2];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
733 vdst[i] = src[4*i+3];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
734 }
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
735 ydst += lumStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
736 src += srcStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
737
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
738 for(i=0; i<chromWidth; i++)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
739 {
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
740 ydst[2*i+0] = src[4*i+0];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
741 ydst[2*i+1] = src[4*i+2];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
742 }
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
743 #endif
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
744 udst += chromStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
745 vdst += chromStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
746 ydst += lumStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
747 src += srcStride;
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
748 }
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
749 #ifdef HAVE_MMX
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
750 asm( EMMS" \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
751 SFENCE" \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
752 :::"memory");
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
753 #endif
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
754 }