# HG changeset patch # User michaelni # Date 1053041403 0 # Node ID 85b71f9f7450b668d9b9a1f02a6c2710064cd599 # Parent ec946cb7439728e8dfa30121e4cd2757367ce19c moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...) diff -r ec946cb74397 -r 85b71f9f7450 dsputil.c --- a/dsputil.c Thu May 15 01:34:47 2003 +0000 +++ b/dsputil.c Thu May 15 23:30:03 2003 +0000 @@ -466,6 +466,14 @@ #else // 64 bit variant #define PIXOP2(OPNAME, OP) \ +static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i>2)&0x0F0F0F0FUL));\ }\ }\ +\ +static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ +}\ +\ static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ int i;\ @@ -635,6 +671,75 @@ OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ }\ \ +static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i, a0, b0, a1, b1;\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + pixels+=line_size;\ + for(i=0; i>2; /* FIXME non put */\ + block[1]= (b1+b0)>>2;\ +\ + pixels+=line_size;\ + block +=line_size;\ +\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + block[0]= (a1+a0)>>2;\ + block[1]= (b1+b0)>>2;\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD32(pixels );\ + b= LD32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ {\ int j;\ @@ -819,6 +924,125 @@ oy += dyy; } } + +static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + switch(width){ + case 2: put_pixels2_c (dst, src, stride, height); break; + case 4: put_pixels4_c (dst, src, stride, height); break; + case 8: put_pixels8_c (dst, src, stride, height); break; + case 16:put_pixels16_c(dst, src, stride, height); break; + } +} + +static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; + } + src += stride; + dst += stride; + } +} + +static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ + int i,j; + for (i=0; i < height; i++) { + for (j=0; j < width; j++) { + dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15; + } + src += stride; + dst += stride; + } +} +#if 0 +#define TPEL_WIDTH(width)\ +static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\ +static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ + void put_tpel_pixels_mc22_c(dst, src, stride, width, height);} +#endif + #define H264_CHROMA_MC(OPNAME, OP)\ static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ const int A=(8-x)*(8-y);\ @@ -2561,6 +2785,8 @@ dspfunc(put_no_rnd, 0, 16); dspfunc(put, 1, 8); dspfunc(put_no_rnd, 1, 8); + dspfunc(put, 2, 4); + dspfunc(put, 3, 2); dspfunc(avg, 0, 16); dspfunc(avg_no_rnd, 0, 16); @@ -2568,6 +2794,16 @@ dspfunc(avg_no_rnd, 1, 8); #undef dspfunc + c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c; + c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c; + c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c; + c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c; + c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c; + c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c; + c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c; + c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c; + c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c; + #define dspfunc(PFX, IDX, NUM) \ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ @@ -2621,7 +2857,7 @@ c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c; c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c; c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c; - + c->hadamard8_diff[0]= hadamard8_diff16_c; c->hadamard8_diff[1]= hadamard8_diff_c; c->hadamard8_abs = hadamard8_abs_c; diff -r ec946cb74397 -r 85b71f9f7450 dsputil.h --- a/dsputil.h Thu May 15 01:34:47 2003 +0000 +++ b/dsputil.h Thu May 15 23:30:03 2003 +0000 @@ -77,6 +77,7 @@ /* add and put pixel (decoding) */ // blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); +typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); @@ -146,18 +147,18 @@ me_cmp_func me_sub_cmp[11]; me_cmp_func mb_cmp[11]; - /* maybe create an array for 16/8 functions */ + /* maybe create an array for 16/8/4/2 functions */ /** * Halfpel motion compensation with rounding (a+b+1)>>1. - * this is an array[2][4] of motion compensation funcions for 2 - * horizontal blocksizes (8,16) and the 4 halfpel positions
+ * this is an array[4][4] of motion compensation funcions for 4 + * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions
* *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] * @param block destination where the result is stored * @param pixels source * @param line_size number of bytes in a horizontal line of block * @param h height */ - op_pixels_func put_pixels_tab[2][4]; + op_pixels_func put_pixels_tab[4][4]; /** * Halfpel motion compensation with rounding (a+b+1)>>1. @@ -194,6 +195,18 @@ * @param h height */ op_pixels_func avg_no_rnd_pixels_tab[2][4]; + + /** + * Thirdpel motion compensation with rounding (a+b+1)>>1. + * this is an array[12] of motion compensation funcions for the 9 thirdpel positions
+ * *pixels_tab[ xthirdpel + 4*ythirdpel ] + * @param block destination where the result is stored + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width? + qpel_mc_func put_qpel_pixels_tab[2][16]; qpel_mc_func avg_qpel_pixels_tab[2][16]; qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; @@ -380,7 +393,9 @@ struct unaligned_64 { uint64_t l; } __attribute__((packed)); struct unaligned_32 { uint32_t l; } __attribute__((packed)); +struct unaligned_16 { uint16_t l; } __attribute__((packed)); +#define LD16(a) (((const struct unaligned_16 *) (a))->l) #define LD32(a) (((const struct unaligned_32 *) (a))->l) #define LD64(a) (((const struct unaligned_64 *) (a))->l) @@ -388,6 +403,7 @@ #else /* __GNUC__ */ +#define LD16(a) (*((uint16_t*)(a))) #define LD32(a) (*((uint32_t*)(a))) #define LD64(a) (*((uint64_t*)(a))) diff -r ec946cb74397 -r 85b71f9f7450 svq3.c --- a/svq3.c Thu May 15 01:34:47 2003 +0000 +++ b/svq3.c Thu May 15 23:30:03 2003 +0000 @@ -262,125 +262,11 @@ return 0; } -static void sixpel_mc_put (MpegEncContext *s, - uint8_t *src, uint8_t *dst, int stride, - int dxy, int width, int height) { - int i, j; - - switch (dxy) { - case 6*0+0: - for (i=0; i < height; i++) { - memcpy (dst, src, width); - src += stride; - dst += stride; - } - break; - case 6*0+2: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11; - } - src += stride; - dst += stride; - } - break; - case 6*0+3: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (src[j] + src[j+1] + 1) >> 1; - } - src += stride; - dst += stride; - } - break; - case 6*0+4: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11; - } - src += stride; - dst += stride; - } - break; - case 6*2+0: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11; - } - src += stride; - dst += stride; - } - break; - case 6*2+2: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15; - } - src += stride; - dst += stride; - } - break; - case 6*2+4: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; - } - src += stride; - dst += stride; - } - break; - case 6*3+0: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (src[j] + src[j+stride]+1) >> 1; - } - src += stride; - dst += stride; - } - break; - case 6*3+3: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (src[j] + src[j+1] + src[j+stride] + src[j+stride+1] + 2) >> 2; - } - src += stride; - dst += stride; - } - break; - case 6*4+0: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11; - } - src += stride; - dst += stride; - } - break; - case 6*4+2: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; - } - src += stride; - dst += stride; - } - break; - case 6*4+4: - for (i=0; i < height; i++) { - for (j=0; j < width; j++) { - dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15; - } - src += stride; - dst += stride; - } - break; - } -} - static inline void svq3_mc_dir_part (MpegEncContext *s, int x, int y, - int width, int height, int mx, int my, int dxy) { + int width, int height, int mx, int my, int dxy, int thirdpel) { uint8_t *src, *dest; int i, emu = 0; + int blocksize= 2 - (width>>3); //16->0, 8->1, 4->2 mx += x; my += y; @@ -405,13 +291,17 @@ mx, my, s->width, s->height); src = s->edge_emu_buffer; } - sixpel_mc_put (s, src, dest, s->linesize, dxy, width, height); + if(thirdpel) + s->dsp.put_tpel_pixels_tab[dxy](dest, src, s->linesize, width, height); + else + s->dsp.put_pixels_tab[blocksize][dxy](dest, src, s->linesize, height); if (!(s->flags & CODEC_FLAG_GRAY)) { mx = (mx + (mx < (int) x)) >> 1; my = (my + (my < (int) y)) >> 1; width = (width >> 1); height = (height >> 1); + blocksize++; for (i=1; i < 3; i++) { dest = s->current_picture.data[i] + (x >> 1) + (y >> 1)*s->uvlinesize; @@ -422,7 +312,10 @@ mx, my, (s->width >> 1), (s->height >> 1)); src = s->edge_emu_buffer; } - sixpel_mc_put (s, src, dest, s->uvlinesize, dxy, width, height); + if(thirdpel) + s->dsp.put_tpel_pixels_tab[dxy](dest, src, s->uvlinesize, width, height); + else + s->dsp.put_pixels_tab[blocksize][dxy](dest, src, s->uvlinesize, height); } } } @@ -441,7 +334,7 @@ h->topright_samples_available = 0xFFFF; if (mb_type == 0) { /* SKIP */ - svq3_mc_dir_part (s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0); + svq3_mc_dir_part (s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0, 0); cbp = 0; mb_type = MB_TYPE_SKIP; @@ -521,17 +414,17 @@ my = ((my + 1)>>1) + dy; fx= ((unsigned)(mx + 0x3000))/3 - 0x1000; fy= ((unsigned)(my + 0x3000))/3 - 0x1000; - dxy= 2*(mx - 3*fx) + 2*6*(my - 3*fy); + dxy= (mx - 3*fx) + 4*(my - 3*fy); - svq3_mc_dir_part (s, x, y, part_width, part_height, fx, fy, dxy); + svq3_mc_dir_part (s, x, y, part_width, part_height, fx, fy, dxy, 1); mx += mx; my += my; } else if (mode == HALFPEL_MODE) { mx = ((unsigned)(mx + 1 + 0x3000))/3 + dx - 0x1000; my = ((unsigned)(my + 1 + 0x3000))/3 + dy - 0x1000; - dxy= 3*(mx&1) + 6*3*(my&1); + dxy= (mx&1) + 2*(my&1); - svq3_mc_dir_part (s, x, y, part_width, part_height, mx>>1, my>>1, dxy); + svq3_mc_dir_part (s, x, y, part_width, part_height, mx>>1, my>>1, dxy, 0); mx *= 3; my *= 3; } else { @@ -539,7 +432,7 @@ mx = ((unsigned)(mx + 3 + 0x6000))/6 + dx - 0x1000; my = ((unsigned)(my + 3 + 0x6000))/6 + dy - 0x1000; - svq3_mc_dir_part (s, x, y, part_width, part_height, mx, my, 0); + svq3_mc_dir_part (s, x, y, part_width, part_height, mx, my, 0, 0); mx *= 6; my *= 6; }