annotate postproc/rgb2rgb.c @ 2845:b512c5b40b0d

c++ compiler would also be handy during build
author eyck
date Sun, 11 Nov 2001 21:08:30 +0000
parents cbb62e07bc0e
children 1d92268eb8fc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
1 /*
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
2 *
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
3 * rgb2rgb.c, Software RGB to RGB convertor
2732
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
4 * pluralize by Software PAL8 to RGB convertor
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
5 * Software YUV to YUV convertor
ae79207a3055 Move yuv2rgb to postprocess
nick
parents: 2725
diff changeset
6 * Software YUV to RGB convertor
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
7 * Written by Nick Kurshev.
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
8 * palette stuff & yuv stuff by Michael
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
9 */
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
10 #include <inttypes.h>
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
11 #include "../config.h"
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
12 #include "rgb2rgb.h"
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
13 #include "../mmx_defs.h"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
14
2535
b44113f46c96 cant compile on non x86 bugfix
michael
parents: 2517
diff changeset
15 #ifdef HAVE_MMX
2755
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
16 static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
17 static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
18 static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
19 static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
20 static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
21 static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
22 static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL;
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
23 static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL;
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
24 static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
25 static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
26 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
27 static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
2741
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
28 static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
29 static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
30 static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
31 static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c000000f800ULL;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
32 static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
33 static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
2755
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
34 #if 0
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
35 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
36 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
37 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
38 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
39
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
40 static uint64_t __attribute__((aligned(8))) dither4[2]={
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
41 0x0103010301030103LL,
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
42 0x0200020002000200LL,};
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
43
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
44 static uint64_t __attribute__((aligned(8))) dither8[2]={
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
45 0x0602060206020602LL,
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
46 0x0004000400040004LL,};
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
47 #endif
2535
b44113f46c96 cant compile on non x86 bugfix
michael
parents: 2517
diff changeset
48 #endif
2513
nick
parents: 2512
diff changeset
49
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
50 void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
51 {
2508
94f9825a3736 Prev ver could work only on x86
nick
parents: 2506
diff changeset
52 uint8_t *dest = dst;
2677
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
53 const uint8_t *s = src;
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
54 const uint8_t *end;
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
55 #ifdef HAVE_MMX
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
56 uint8_t *mm_end;
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
57 #endif
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
58 end = s + src_size;
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
59 #ifdef HAVE_MMX
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
60 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
2740
1583214489a2 optimized rgb24to16 stuff
nick
parents: 2738
diff changeset
61 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
62 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
2740
1583214489a2 optimized rgb24to16 stuff
nick
parents: 2738
diff changeset
63 if(mm_end == end) mm_end -= MMREG_SIZE*4;
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
64 while(s < mm_end)
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
65 {
2511
6db23dd30242 mmx, mmx2, 3dnow optimized 24to32
nick
parents: 2510
diff changeset
66 __asm __volatile(
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
67 PREFETCH" 32%1\n\t"
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
68 "movd %1, %%mm0\n\t"
2738
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
69 "punpckldq 3%1, %%mm0\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
70 "movd 6%1, %%mm1\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
71 "punpckldq 9%1, %%mm1\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
72 "movd 12%1, %%mm2\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
73 "punpckldq 15%1, %%mm2\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
74 "movd 18%1, %%mm3\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
75 "punpckldq 21%1, %%mm3\n\t"
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
76 "pand %%mm7, %%mm0\n\t"
2738
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
77 "pand %%mm7, %%mm1\n\t"
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
78 "pand %%mm7, %%mm2\n\t"
2738
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
79 "pand %%mm7, %%mm3\n\t"
2511
6db23dd30242 mmx, mmx2, 3dnow optimized 24to32
nick
parents: 2510
diff changeset
80 MOVNTQ" %%mm0, %0\n\t"
2738
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
81 MOVNTQ" %%mm1, 8%0\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
82 MOVNTQ" %%mm2, 16%0\n\t"
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
83 MOVNTQ" %%mm3, 24%0"
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
84 :"=m"(*dest)
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
85 :"m"(*s)
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
86 :"memory");
2738
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
87 dest += 32;
dfa63a7db294 rgb24to32 now is faster
nick
parents: 2732
diff changeset
88 s += 24;
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
89 }
2513
nick
parents: 2512
diff changeset
90 __asm __volatile(SFENCE:::"memory");
2511
6db23dd30242 mmx, mmx2, 3dnow optimized 24to32
nick
parents: 2510
diff changeset
91 __asm __volatile(EMMS:::"memory");
2510
42e1ae2c8f5f mmx optimized 24to32
nick
parents: 2508
diff changeset
92 #endif
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
93 while(s < end)
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
94 {
2508
94f9825a3736 Prev ver could work only on x86
nick
parents: 2506
diff changeset
95 *dest++ = *s++;
94f9825a3736 Prev ver could work only on x86
nick
parents: 2506
diff changeset
96 *dest++ = *s++;
94f9825a3736 Prev ver could work only on x86
nick
parents: 2506
diff changeset
97 *dest++ = *s++;
94f9825a3736 Prev ver could work only on x86
nick
parents: 2506
diff changeset
98 *dest++ = 0;
2504
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
99 }
13e1c5ab417a vo_vesa: rgb2rgb support
nick
parents:
diff changeset
100 }
2505
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
101
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
102 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
2505
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
103 {
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
104 uint8_t *dest = dst;
2677
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
105 const uint8_t *s = src;
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
106 const uint8_t *end;
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
107 #ifdef HAVE_MMX
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
108 uint8_t *mm_end;
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
109 #endif
2505
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
110 end = s + src_size;
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
111 #ifdef HAVE_MMX
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
112 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
113 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
114 while(s < mm_end)
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
115 {
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
116 __asm __volatile(
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
117 PREFETCH" 32%1\n\t"
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
118 "movq %1, %%mm0\n\t"
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
119 "movq 8%1, %%mm1\n\t"
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
120 "movq 16%1, %%mm4\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
121 "movq 24%1, %%mm5\n\t"
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
122 "movq %%mm0, %%mm2\n\t"
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
123 "movq %%mm1, %%mm3\n\t"
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
124 "movq %%mm4, %%mm6\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
125 "movq %%mm5, %%mm7\n\t"
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
126 "psrlq $8, %%mm2\n\t"
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
127 "psrlq $8, %%mm3\n\t"
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
128 "psrlq $8, %%mm6\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
129 "psrlq $8, %%mm7\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
130 "pand %2, %%mm0\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
131 "pand %2, %%mm1\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
132 "pand %2, %%mm4\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
133 "pand %2, %%mm5\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
134 "pand %3, %%mm2\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
135 "pand %3, %%mm3\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
136 "pand %3, %%mm6\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
137 "pand %3, %%mm7\n\t"
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
138 "por %%mm2, %%mm0\n\t"
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
139 "por %%mm3, %%mm1\n\t"
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
140 "por %%mm6, %%mm4\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
141 "por %%mm7, %%mm5\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
142
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
143 "movq %%mm1, %%mm2\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
144 "movq %%mm4, %%mm3\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
145 "psllq $48, %%mm2\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
146 "psllq $32, %%mm3\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
147 "pand %4, %%mm2\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
148 "pand %5, %%mm3\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
149 "por %%mm2, %%mm0\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
150 "psrlq $16, %%mm1\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
151 "psrlq $32, %%mm4\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
152 "psllq $16, %%mm5\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
153 "por %%mm3, %%mm1\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
154 "pand %6, %%mm5\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
155 "por %%mm5, %%mm4\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
156
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
157 MOVNTQ" %%mm0, %0\n\t"
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
158 MOVNTQ" %%mm1, 8%0\n\t"
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
159 MOVNTQ" %%mm4, 16%0"
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
160 :"=m"(*dest)
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
161 :"m"(*s),"m"(mask24l),
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
162 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
163 :"memory");
2746
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
164 dest += 24;
dece635a28e3 Minor speedup of rgb32to24. (performance is not successful)
nick
parents: 2741
diff changeset
165 s += 32;
2517
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
166 }
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
167 __asm __volatile(SFENCE:::"memory");
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
168 __asm __volatile(EMMS:::"memory");
3d507ef1e3ed 32to24: MMX, MMX2, 3DNOW optimization
nick
parents: 2516
diff changeset
169 #endif
2505
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
170 while(s < end)
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
171 {
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
172 *dest++ = *s++;
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
173 *dest++ = *s++;
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
174 *dest++ = *s++;
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
175 s++;
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
176 }
2aaa11d22f91 vo_vesa: more rgb2rgb support
nick
parents: 2504
diff changeset
177 }
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
178
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
179 /*
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
180 Original by Strepto/Astral
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
181 ported to gcc & bugfixed : A'rpi
2564
3d04a0991dce cosmetic
nick
parents: 2538
diff changeset
182 MMX2, 3DNOW optimization by Nick Kurshev
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
183 32bit c version, and and&add trick by Michael Niedermayer
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
184 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
185 void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
186 {
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
187 #ifdef HAVE_MMX
2677
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
188 register const char* s=src+src_size;
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
189 register char* d=dst+src_size;
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
190 register int offs=-src_size;
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
191 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
192 __asm __volatile(
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
193 "movq %0, %%mm4\n\t"
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
194 ::"m"(mask15s));
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
195 while(offs<0)
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
196 {
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
197 __asm __volatile(
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
198 PREFETCH" 32%1\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
199 "movq %1, %%mm0\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
200 "movq 8%1, %%mm2\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
201 "movq %%mm0, %%mm1\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
202 "movq %%mm2, %%mm3\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
203 "pand %%mm4, %%mm0\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
204 "pand %%mm4, %%mm2\n\t"
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
205 "paddw %%mm1, %%mm0\n\t"
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
206 "paddw %%mm3, %%mm2\n\t"
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
207 MOVNTQ" %%mm0, %0\n\t"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
208 MOVNTQ" %%mm2, 8%0"
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
209 :"=m"(*(d+offs))
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
210 :"m"(*(s+offs))
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
211 );
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
212 offs+=16;
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
213 }
2538
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
214 __asm __volatile(SFENCE:::"memory");
71320898b333 Finish mmx2, 3dnow optimiz. 15to16 should be tested. Better fix of can't compile
nick
parents: 2535
diff changeset
215 __asm __volatile(EMMS:::"memory");
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
216 #else
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
217 #if 0
2677
794dec2fae64 using const modifier
nick
parents: 2564
diff changeset
218 const uint16_t *s1=( uint16_t * )src;
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
219 uint16_t *d1=( uint16_t * )dst;
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
220 uint16_t *e=((uint8_t *)s1)+src_size;
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
221 while( s1<e ){
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
222 register int x=*( s1++ );
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
223 /* rrrrrggggggbbbbb
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
224 0rrrrrgggggbbbbb
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
225 0111 1111 1110 0000=0x7FE0
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
226 00000000000001 1111=0x001F */
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
227 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
228 }
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
229 #else
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
230 const unsigned *s1=( unsigned * )src;
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
231 unsigned *d1=( unsigned * )dst;
2698
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
232 int i;
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
233 int size= src_size>>2;
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
234 for(i=0; i<size; i++)
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
235 {
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
236 register int x= s1[i];
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
237 // d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not always be true
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
238 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
239
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
240 }
22652c028692 faster 15to16 bit rgb (the mmx routine is limited by memory speed so there is no difference ): but the c routine is faster
michael
parents: 2697
diff changeset
241 #endif
2506
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
242 #endif
501752469c39 vo_vesa: more rgb2rgb support
nick
parents: 2505
diff changeset
243 }
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
244
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
/**
 * Expands 8 bit palettized pixels to 4 bytes per pixel.
 *
 * The palette is assumed to contain bgr32 entries, i.e. 4 bytes per entry.
 * Each source byte selects one entry and the 4 bytes of that entry are copied
 * unchanged to the output.
 *
 * @param src        palette indices, one byte per pixel
 * @param dst        output buffer, receives 4*num_pixels bytes
 * @param num_pixels number of pixels to convert
 * @param palette    palette table, 4 bytes per entry; must contain an entry
 *                   for every index that occurs in src
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
	for(i=0; i<num_pixels; i++)
	{
		/* Copy the entry byte by byte instead of through an (unsigned *) cast:
		   neither dst nor palette is guaranteed to be suitably aligned for a
		   word access, and the entry size is 4 bytes regardless of
		   sizeof(unsigned). */
		const uint8_t *entry = &palette[ src[i]*4 ];
		dst[0]= entry[0];
		dst[1]= entry[1];
		dst[2]= entry[2];
		dst[3]= entry[3];
		dst+= 4;
	}
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
254
2697
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
/**
 * Expands 8 bit palettized pixels to 3 bytes per pixel.
 *
 * The palette is assumed to contain bgr32 entries, i.e. 4 bytes per entry.
 * Each source byte selects one entry; the first 3 bytes of that entry are
 * copied to the output and the 4th byte is dropped.
 *
 * @param src        palette indices, one byte per pixel
 * @param dst        output buffer, receives 3*num_pixels bytes
 * @param num_pixels number of pixels to convert
 * @param palette    palette table, 4 bytes per entry; must contain an entry
 *                   for every index that occurs in src
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
	unsigned i;
	for(i=0; i<num_pixels; i++)
	{
		/* A single 4 byte store per pixel is deliberately not used here: it
		   would write 1 byte too much and might cause alignment issues on
		   some architectures. */
		const uint8_t *entry = &palette[ src[i]*4 ];
		dst[0]= entry[0];
		dst[1]= entry[1];
		dst[2]= entry[2];
		dst+= 3;
	}
}
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
275
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts packed 4-byte pixels to 16 bit (5-6-5) pixels.
 *
 * Each source pixel occupies 4 bytes: byte 0 is taken as blue, byte 1 as
 * green, byte 2 as red and byte 3 is ignored. Each output pixel is a
 * uint16_t holding blue in bits 0-4, green in bits 5-10 and red in
 * bits 11-15.
 *
 * @param src      source pixels
 * @param dst      output buffer, written as uint16_t (so it has to be
 *                 suitably aligned); receives one uint16_t per source pixel
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size not a multiple of 4) is ignored
 */
void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7/mm6 hold the red/green masks for the whole loop; the masks are
	   defined elsewhere in this file. */
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_16mask),"m"(green_16mask));
	/* One iteration reads exactly 16 source bytes (4 pixels) and writes 4
	   output pixels. The bound is on the bytes remaining, not on the
	   address of end, so an unaligned src can never be read past end. */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			/* blue */
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			/* green */
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			/* red */
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the upper 16 bits */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar conversion; with MMX it only handles what the loop above left
	   over. s advances by the full 4 byte pixel, skipping the ignored byte. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 4;
	}
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
346
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts packed 4-byte pixels to 15 bit (5-5-5) pixels.
 *
 * Each source pixel occupies 4 bytes: byte 0 is taken as blue, byte 1 as
 * green, byte 2 as red and byte 3 is ignored. Each output pixel is a
 * uint16_t holding blue in bits 0-4, green in bits 5-9 and red in
 * bits 10-14; bit 15 is left clear.
 *
 * @param src      source pixels
 * @param dst      output buffer, written as uint16_t (so it has to be
 *                 suitably aligned); receives one uint16_t per source pixel
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size not a multiple of 4) is ignored
 */
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7/mm6 hold the red/green masks for the whole loop; the masks are
	   defined elsewhere in this file. */
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_15mask),"m"(green_15mask));
	/* One iteration reads exactly 16 source bytes (4 pixels) and writes 4
	   output pixels. The bound is on the bytes remaining, not on the
	   address of end, so an unaligned src can never be read past end. */
	while(end - s >= 16)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 4%1, %%mm3\n\t"
			"punpckldq 8%1, %%mm0\n\t"
			"punpckldq 12%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			/* blue */
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			/* green */
			"psrlq $6, %%mm1\n\t"
			"psrlq $6, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			/* red */
			"psrlq $9, %%mm2\n\t"
			"psrlq $9, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the upper 16 bits */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
		d += 4;
		s += 16;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar conversion; with MMX it only handles what the loop above left
	   over. s advances by the full 4 byte pixel, skipping the ignored byte. */
	while(end - s >= 4)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
		s += 4;
	}
}
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
417
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
/**
 * Converts packed 3-byte pixels to 16 bit (5-6-5) pixels.
 *
 * Each source pixel occupies 3 bytes: byte 0 is taken as blue, byte 1 as
 * green and byte 2 as red. Each output pixel is a uint16_t holding blue in
 * bits 0-4, green in bits 5-10 and red in bits 11-15.
 *
 * @param src      source pixels
 * @param dst      output buffer, written as uint16_t (so it has to be
 *                 suitably aligned); receives one uint16_t per source pixel
 * @param src_size size of the source in bytes; a trailing partial pixel
 *                 (src_size not a multiple of 3) is ignored
 */
void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
	const uint8_t *s = src;
	const uint8_t *end = s + src_size;
	uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
	__asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
	/* mm7/mm6 hold the red/green masks for the whole loop; the masks are
	   defined elsewhere in this file. */
	__asm __volatile(
		"movq %0, %%mm7\n\t"
		"movq %1, %%mm6\n\t"
		::"m"(red_16mask),"m"(green_16mask));
	/* One iteration converts 4 pixels (12 bytes), but the last 4 byte load
	   starts at offset 9 and therefore touches 13 bytes. Requiring 13
	   remaining bytes keeps every load inside the source buffer; whatever
	   is left is converted by the scalar loop below. */
	while(end - s >= 13)
	{
		__asm __volatile(
			PREFETCH" 32%1\n\t"
			/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 */
			"movd %1, %%mm0\n\t"
			"movd 3%1, %%mm3\n\t"
			"punpckldq 6%1, %%mm0\n\t"
			"punpckldq 9%1, %%mm3\n\t"
			"movq %%mm0, %%mm1\n\t"
			"movq %%mm0, %%mm2\n\t"
			"movq %%mm3, %%mm4\n\t"
			"movq %%mm3, %%mm5\n\t"
			/* blue */
			"psrlq $3, %%mm0\n\t"
			"psrlq $3, %%mm3\n\t"
			"pand %2, %%mm0\n\t"
			"pand %2, %%mm3\n\t"
			/* green */
			"psrlq $5, %%mm1\n\t"
			"psrlq $5, %%mm4\n\t"
			"pand %%mm6, %%mm1\n\t"
			"pand %%mm6, %%mm4\n\t"
			/* red */
			"psrlq $8, %%mm2\n\t"
			"psrlq $8, %%mm5\n\t"
			"pand %%mm7, %%mm2\n\t"
			"pand %%mm7, %%mm5\n\t"
			"por %%mm1, %%mm0\n\t"
			"por %%mm4, %%mm3\n\t"
			"por %%mm2, %%mm0\n\t"
			"por %%mm5, %%mm3\n\t"
			/* interleave: pixels 1 and 3 go to the upper 16 bits */
			"psllq $16, %%mm3\n\t"
			"por %%mm3, %%mm0\n\t"
			MOVNTQ" %%mm0, %0\n\t"
			:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
		d += 4;
		s += 12;
	}
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* Scalar conversion; with MMX it only handles what the loop above left
	   over. */
	while(end - s >= 3)
	{
		const int b= s[0];
		const int g= s[1];
		const int r= s[2];

		*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
		s += 3;
	}
}
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
489
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
490 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
491 {
2741
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
492 #ifdef HAVE_MMX
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
493 const uint8_t *s = src;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
494 const uint8_t *end,*mm_end;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
495 uint16_t *d = (uint16_t *)dst;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
496 end = s + src_size;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
497 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
498 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
499 __asm __volatile(
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
500 "movq %0, %%mm7\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
501 "movq %1, %%mm6\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
502 ::"m"(red_15mask),"m"(green_15mask));
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
503 if(mm_end == end) mm_end -= MMREG_SIZE*2;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
504 while(s < mm_end)
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
505 {
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
506 __asm __volatile(
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
507 PREFETCH" 32%1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
508 "movd %1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
509 "movd 3%1, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
510 "punpckldq 6%1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
511 "punpckldq 9%1, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
512 "movq %%mm0, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
513 "movq %%mm0, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
514 "movq %%mm3, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
515 "movq %%mm3, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
516 "psrlq $3, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
517 "psrlq $3, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
518 "pand %2, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
519 "pand %2, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
520 "psrlq $6, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
521 "psrlq $6, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
522 "pand %%mm6, %%mm1\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
523 "pand %%mm6, %%mm4\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
524 "psrlq $9, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
525 "psrlq $9, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
526 "pand %%mm7, %%mm2\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
527 "pand %%mm7, %%mm5\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
528 "por %%mm1, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
529 "por %%mm4, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
530 "por %%mm2, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
531 "por %%mm5, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
532 "psllq $16, %%mm3\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
533 "por %%mm3, %%mm0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
534 MOVNTQ" %%mm0, %0\n\t"
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
535 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
536 d += 4;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
537 s += 12;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
538 }
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
539 while(s < end)
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
540 {
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
541 const int b= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
542 const int g= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
543 const int r= *s++;
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
544 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
545 }
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
546 __asm __volatile(SFENCE:::"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
547 __asm __volatile(EMMS:::"memory");
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
548 #else
2719
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
549 unsigned j,i,num_pixels=src_size/3;
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
550 uint16_t *d = (uint16_t *)dst;
fafa73d6d80c Fixed rgb32(24)to16 stuff, rgb32(24)to15 is still broken
nick
parents: 2718
diff changeset
551 for(i=0,j=0; j<num_pixels; i+=3,j++)
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
552 {
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
553 const int b= src[i+0];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
554 const int g= src[i+1];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
555 const int r= src[i+2];
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
556
2720
4ba64e254042 Fixed rgb32(24)to15 stuff
nick
parents: 2719
diff changeset
557 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
558 }
2741
b8a692c59b64 MMX2, 3DNOW, MMX optimized rgb32(24)to16(15) stuff
nick
parents: 2740
diff changeset
559 #endif
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
560 }
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
561
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
562 /**
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
563 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
564 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
565 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
566 {
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
567 unsigned i;
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
568 for(i=0; i<num_pixels; i++)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
569 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
570 }
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
571
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
572 /**
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
573 * Palette is assumed to contain bgr15, see rgb32to15 to convert the palette
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
574 */
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
575 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
576 {
2718
9c5e64493742 Well - old algorithms and new stuff rgb24to16(15)
nick
parents: 2711
diff changeset
577 unsigned i;
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
578 for(i=0; i<num_pixels; i++)
2694
2924350d92ed bgr32to16, bgr32to15 (needed for palette stuff)
michael
parents: 2677
diff changeset
579 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
2697
1eaf3f89e49f palette to bgr24
michael
parents: 2694
diff changeset
580 }
2755
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
581
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
582 void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
583 {
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
584 int num_pixels= src_size >> 2;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
585 #ifdef HAVE_MMX
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
586 asm volatile (
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
587 "xorl %%eax, %%eax \n\t"
2800
7847d6b7ad3d .balign or we'll align by 64kb on some architectures
michael
parents: 2799
diff changeset
588 ".balign 16 \n\t"
2755
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
589 "1: \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
590 PREFETCH" 32(%0, %%eax) \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
591 "movq (%0, %%eax), %%mm0 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
592 "movq %%mm0, %%mm1 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
593 "movq %%mm0, %%mm2 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
594 "pslld $16, %%mm0 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
595 "psrld $16, %%mm1 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
596 "pand mask32r, %%mm0 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
597 "pand mask32g, %%mm2 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
598 "pand mask32b, %%mm1 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
599 "por %%mm0, %%mm2 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
600 "por %%mm1, %%mm2 \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
601 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
602 "addl $2, %%eax \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
603 "cmpl %2, %%eax \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
604 " jb 1b \n\t"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
605 :: "r" (src), "r"(dst), "r" (num_pixels)
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
606 : "%eax"
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
607 );
2766
michael
parents: 2755
diff changeset
608
michael
parents: 2755
diff changeset
609 __asm __volatile(SFENCE:::"memory");
michael
parents: 2755
diff changeset
610 __asm __volatile(EMMS:::"memory");
2755
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
611 #else
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
612 int i;
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
613 for(i=0; i<num_pixels; i++)
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
614 {
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
615 dst[4*i + 0] = src[4*i + 2];
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
616 dst[4*i + 1] = src[4*i + 1];
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
617 dst[4*i + 2] = src[4*i + 0];
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
618 }
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
619 #endif
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
620 }
2f93f4351765 rgb32tobgr32 / bgr32torgb32
michael
parents: 2746
diff changeset
621
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
622 /**
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
623 *
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
624 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
625 * problem for anyone then tell me, and I'll fix it)
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
626 */
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
627 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
628 unsigned int width, unsigned int height,
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
629 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
630 {
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
631 int y;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
632 const int chromWidth= width>>1;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
633 for(y=0; y<height; y++)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
634 {
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
635 #ifdef HAVE_MMX
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
636 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
637 asm volatile(
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
638 "xorl %%eax, %%eax \n\t"
2800
7847d6b7ad3d .balign or we'll align by 64kb on some architectures
michael
parents: 2799
diff changeset
639 ".balign 16 \n\t"
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
640 "1: \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
641 PREFETCH" 32(%1, %%eax, 2) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
642 PREFETCH" 32(%2, %%eax) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
643 PREFETCH" 32(%3, %%eax) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
644 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
645 "movq %%mm0, %%mm2 \n\t" // U(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
646 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
647 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
648 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
649
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
650 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
651 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
652 "movq %%mm3, %%mm4 \n\t" // Y(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
653 "movq %%mm5, %%mm6 \n\t" // Y(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
654 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
655 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
656 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
657 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
658
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
659 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
660 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
661 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
662 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
663
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
664 "addl $8, %%eax \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
665 "cmpl %4, %%eax \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
666 " jb 1b \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
667 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
668 : "%eax"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
669 );
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
670 #else
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
671 int i;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
672 for(i=0; i<chromWidth; i++)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
673 {
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
674 dst[4*i+0] = ysrc[2*i+0];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
675 dst[4*i+1] = usrc[i];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
676 dst[4*i+2] = ysrc[2*i+1];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
677 dst[4*i+3] = vsrc[i];
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
678 }
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
679 #endif
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
680 if(y&1)
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
681 {
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
682 usrc += chromStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
683 vsrc += chromStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
684 }
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
685 ysrc += lumStride;
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
686 dst += dstStride;
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
687 }
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
688 #ifdef HAVE_MMX
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
689 asm( EMMS" \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
690 SFENCE" \n\t"
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
691 :::"memory");
2702
440312d953a8 yv12toyuy2 in MMX
michael
parents: 2701
diff changeset
692 #endif
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
693 }
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
694
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
695 /**
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
696 *
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
697 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
698 * problem for anyone then tell me, and I'll fix it)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
699 */
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
700 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
701 unsigned int width, unsigned int height,
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
702 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
703 {
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
704 int y;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
705 const int chromWidth= width>>1;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
706 for(y=0; y<height; y+=2)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
707 {
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
708 #ifdef HAVE_MMX
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
709 asm volatile(
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
710 "xorl %%eax, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
711 "pcmpeqw %%mm7, %%mm7 \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
712 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2800
7847d6b7ad3d .balign or we'll align by 64kb on some architectures
michael
parents: 2799
diff changeset
713 ".balign 16 \n\t"
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
714 "1: \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
715 PREFETCH" 64(%0, %%eax, 4) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
716 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
717 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
718 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
719 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
720 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
721 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
722 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
723 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
724 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
725 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
726
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
727 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
728
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
729 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
730 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
731 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
732 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
733 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
734 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
735 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
736 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
737 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
738 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
739
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
740 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
741
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
742 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
743 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
744 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
745 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
746 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
747 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
748 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
749 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
750
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
751 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
752 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
753
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
754 "addl $8, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
755 "cmpl %4, %%eax \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
756 " jb 1b \n\t"
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
757 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
758 : "memory", "%eax"
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
759 );
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
760
2806
cbb62e07bc0e yuy2toyv12 bugfix
michael
parents: 2801
diff changeset
761 ydst += lumStride;
cbb62e07bc0e yuy2toyv12 bugfix
michael
parents: 2801
diff changeset
762 src += srcStride;
cbb62e07bc0e yuy2toyv12 bugfix
michael
parents: 2801
diff changeset
763
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
764 asm volatile(
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
765 "xorl %%eax, %%eax \n\t"
2800
7847d6b7ad3d .balign or we'll align by 64kb on some architectures
michael
parents: 2799
diff changeset
766 ".balign 16 \n\t"
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
767 "1: \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
768 PREFETCH" 64(%0, %%eax, 4) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
769 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
770 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
771 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
772 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
773 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
774 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
775 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
776 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
777 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
778 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
779
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
780 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
781 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
782
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
783 "addl $8, %%eax \n\t"
2725
5bba527c9a4c unsigned stuff
michael
parents: 2724
diff changeset
784 "cmpl %4, %%eax \n\t"
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
785 " jb 1b \n\t"
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
786
2806
cbb62e07bc0e yuy2toyv12 bugfix
michael
parents: 2801
diff changeset
787 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
788 : "memory", "%eax"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
789 );
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
790 #else
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
791 int i;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
792 for(i=0; i<chromWidth; i++)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
793 {
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
794 ydst[2*i+0] = src[4*i+0];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
795 udst[i] = src[4*i+1];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
796 ydst[2*i+1] = src[4*i+2];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
797 vdst[i] = src[4*i+3];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
798 }
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
799 ydst += lumStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
800 src += srcStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
801
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
802 for(i=0; i<chromWidth; i++)
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
803 {
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
804 ydst[2*i+0] = src[4*i+0];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
805 ydst[2*i+1] = src[4*i+2];
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
806 }
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
807 #endif
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
808 udst += chromStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
809 vdst += chromStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
810 ydst += lumStride;
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
811 src += srcStride;
2701
9b47bc409083 yv12 <-> yuy2 in C
michael
parents: 2698
diff changeset
812 }
2724
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
813 #ifdef HAVE_MMX
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
814 asm( EMMS" \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
815 SFENCE" \n\t"
c08b7af26782 yuy2toyv12 fixed and speedup
michael
parents: 2723
diff changeset
816 :::"memory");
2704
b4c6699d3893 yuy2toyv12 in MMX
michael
parents: 2702
diff changeset
817 #endif
2723
22aba8af94af fixed yv12toyuy2
michael
parents: 2720
diff changeset
818 }
2801
318c240363c7 uyvy->uv12 added
arpi
parents: 2800
diff changeset
819
318c240363c7 uyvy->uv12 added
arpi
parents: 2800
diff changeset
/**
 * Convert a packed UYVY image into three separate planes (Y, U, V) with
 * chroma kept for every second row only (YV12-style 4:2:0 output).
 *
 * Every source row is read as width/2 groups of four bytes in the order
 * U, Y0, V, Y1. Luma is copied from every row. Chroma is copied from the
 * even rows (0, 2, 4, ...) only; the chroma bytes of odd rows are dropped,
 * not averaged.
 *
 * width should be a multiple of 2: only width/2 four-byte groups are
 * converted, so a trailing odd pixel is ignored.
 * height may be odd: the last row is then handled like an even row (luma and
 * chroma are stored) and nothing is read or written beyond it. The chroma
 * planes therefore receive (height+1)/2 rows.
 *
 * @param src         packed UYVY input
 * @param ydst        luma output plane
 * @param udst        U output plane
 * @param vdst        V output plane
 * @param width       image width in pixels
 * @param height      image height in rows
 * @param lumStride   distance in bytes between two rows of ydst
 * @param chromStride distance in bytes between two rows of udst / vdst
 * @param srcStride   distance in bytes between two rows of src
 */
void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	const unsigned int chromWidth = width >> 1;
	unsigned int y;

	for (y = 0; y < height; y += 2)
	{
		unsigned int i;

		/* even row: store luma and both chroma samples */
		for (i = 0; i < chromWidth; i++)
		{
			udst[i]       = src[4*i + 0];
			ydst[2*i + 0] = src[4*i + 1];
			vdst[i]       = src[4*i + 2];
			ydst[2*i + 1] = src[4*i + 3];
		}

		/* odd height: there is no second row in this pair, so stop here
		 * instead of touching memory behind the image */
		if (height - y < 2)
			break;

		ydst += lumStride;
		src  += srcStride;

		/* odd row: store luma only, chroma of this row is discarded */
		for (i = 0; i < chromWidth; i++)
		{
			ydst[2*i + 0] = src[4*i + 1];
			ydst[2*i + 1] = src[4*i + 3];
		}

		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
}
318c240363c7 uyvy->uv12 added
arpi
parents: 2800
diff changeset
855
318c240363c7 uyvy->uv12 added
arpi
parents: 2800
diff changeset
856