changeset 8073:915bc657348f libavcodec
Rename template-included sources from .h to _template.c.
There are multiple source files that are #include'd rather than
compiled on their own: they are used as templates to generate several
similar pieces of code, such as asm-optimised variants. Some of these
files are currently named with a .h extension, although they are not
headers in any reasonable sense. Rename them so that they end in
_template.c instead of .h.
| author | flameeyes |
|---|---|
| date | Mon, 27 Oct 2008 14:35:58 +0000 |
| parents | 4b32e8762864 |
| children | 31e90d4abe28 |
| files | armv4l/dsputil_iwmmxt.c, armv4l/dsputil_iwmmxt_rnd.h, armv4l/dsputil_iwmmxt_rnd_template.c, i386/dsputil_mmx.c, i386/dsputil_mmx_avg.h, i386/dsputil_mmx_avg_template.c, i386/dsputil_mmx_qns.h, i386/dsputil_mmx_qns_template.c, i386/dsputil_mmx_rnd.h, i386/dsputil_mmx_rnd_template.c, i386/dsputilenc_mmx.c, imgconvert.c, imgconvert_template.c, imgconvert_template.h |
| diffstat | 14 files changed, 3610 insertions(+), 3610 deletions(-) |
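For readers unfamiliar with the pattern the commit message describes, here is a minimal two-file sketch of the multiple-inclusion technique (hypothetical names — avg_template.c, DEF(), ROUND — chosen for illustration, not FFmpeg's actual code):

```c
/* ---- avg_template.c -------------------------------------------------
 * Deliberately has no include guard: the includer defines DEF() and
 * ROUND, and each inclusion of this file stamps out a new function. */
int DEF(avg)(int a, int b)
{
    return (a + b + ROUND) >> 1;    /* average with selectable rounding */
}

/* ---- avg.c -----------------------------------------------------------
 * Instantiates the template twice with different macro settings, the
 * same way dsputil_iwmmxt.c includes the renamed template below. */
#include <stdio.h>

#define DEF(x) x##_rnd
#define ROUND 1
#include "avg_template.c"
#undef DEF
#undef ROUND

#define DEF(x) x##_no_rnd
#define ROUND 0
#include "avg_template.c"
#undef DEF
#undef ROUND

int main(void)
{
    printf("%d %d\n", avg_rnd(1, 2), avg_no_rnd(1, 2));   /* prints: 2 1 */
    return 0;
}
```

Only avg.c is ever compiled; the template is pulled in textually. That is exactly why a .h name misrepresents such files and a _template.c suffix describes them accurately.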
--- a/armv4l/dsputil_iwmmxt.c	Mon Oct 27 00:25:19 2008 +0000
+++ b/armv4l/dsputil_iwmmxt.c	Mon Oct 27 14:35:58 2008 +0000
@@ -24,7 +24,7 @@
 #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
 #define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
 #define WAVG2B "wavg2b"
-#include "dsputil_iwmmxt_rnd.h"
+#include "dsputil_iwmmxt_rnd_template.c"
 #undef DEF
 #undef SET_RND
 #undef WAVG2B
@@ -32,7 +32,7 @@
 #define DEF(x, y) x ## _ ## y ##_iwmmxt
 #define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
 #define WAVG2B "wavg2br"
-#include "dsputil_iwmmxt_rnd.h"
+#include "dsputil_iwmmxt_rnd_template.c"
 #undef DEF
 #undef SET_RND
 #undef WAVG2BR
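In the hunks above, DEF() pastes the generated function names — DEF(put, pixels8) becomes put_no_rnd_pixels8_iwmmxt in the first hunk and put_pixels8_iwmmxt in the second — and WAVG2B picks the iWMMXt byte-average instruction each instantiation uses: wavg2br (rounding) for the plain variants, wavg2b (truncating) for the _no_rnd_ ones. A plain-C sketch of the two averages and of one generated routine (hypothetical helper names; the real work is the inline assembly that follows):

```c
#include <stdint.h>

/* Models of the two iWMMXt byte averages the WAVG2B macro selects:
 * "wavg2br" -> rounds halves up (plain/rnd variants),
 * "wavg2b"  -> truncates        (_no_rnd_ variants). */
static uint8_t avg2_rnd(uint8_t a, uint8_t b)    { return (a + b + 1) >> 1; }
static uint8_t avg2_no_rnd(uint8_t a, uint8_t b) { return (a + b) >> 1; }

/* Scalar equivalent of a generated put_pixels8_x2 routine: each output
 * byte averages a source pixel with its right-hand neighbour. */
static void put_pixels8_x2_model(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = avg2_rnd(pixels[j], pixels[j + 1]);
        block  += line_size;
        pixels += line_size;
    }
}
```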
--- a/armv4l/dsputil_iwmmxt_rnd.h	Mon Oct 27 00:25:19 2008 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1118 +0,0 @@
[All 1118 lines deleted: the LGPL license header, the note that the file deliberately has no inclusion guards, and the iWMMXt inline-assembly DEF(put, ...)/DEF(avg, ...) pixel routines (pixels8/pixels16 plus their _x2, _y2 and _xy2 half-pel variants). The identical content is re-added below as armv4l/dsputil_iwmmxt_rnd_template.c.]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/armv4l/dsputil_iwmmxt_rnd_template.c	Mon Oct 27 14:35:58 2008 +0000
@@ -0,0 +1,1118 @@
+/*
+ * iWMMXt optimized DSP utils
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* This header intentionally has no multiple inclusion guards. It is meant to
+ * be included multiple times and generates different code depending on the
+ * value of certain #defines. */
+
+void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    int stride = line_size;
+    __asm__ volatile (
+        "and r12, %[pixels], #7 \n\t"
+        "bic %[pixels], %[pixels], #7 \n\t"
+        "tmcr wcgr1, r12 \n\t"
+        "add r4, %[pixels], %[line_size] \n\t"
+        "add r5, %[block], %[line_size] \n\t"
+        "mov %[line_size], %[line_size], lsl #1 \n\t"
+        "1: \n\t"
+        "wldrd wr0, [%[pixels]] \n\t"
+        "subs %[h], %[h], #2 \n\t"
+        "wldrd wr1, [%[pixels], #8] \n\t"
+        "add %[pixels], %[pixels], %[line_size] \n\t"
+        "wldrd wr3, [r4] \n\t"
+        "pld [%[pixels]] \n\t"
+        "pld [%[pixels], #32] \n\t"
+        "wldrd wr4, [r4, #8] \n\t"
+        "add r4, r4, %[line_size] \n\t"
+        "walignr1 wr8, wr0, wr1 \n\t"
+        "pld [r4] \n\t"
+        "pld [r4, #32] \n\t"
+        "walignr1 wr10, wr3, wr4 \n\t"
+        "wstrd wr8, [%[block]] \n\t"
+        "add %[block], %[block], %[line_size] \n\t"
+        "wstrd wr10, [r5] \n\t"
+        "add r5, r5, %[line_size] \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+        :
+        : "memory", "r4", "r5", "r12");
+}
[The remaining added lines restore the rest of the deleted file verbatim: DEF(avg, pixels8), DEF(put, pixels16), DEF(avg, pixels16) and the _x2, _y2 and _xy2 half-pel variants. The captured listing breaks off partway through DEF(avg, pixels8_xy2).]
wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + 
"waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "pld [%[block], #32] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +}
--- a/i386/dsputil_mmx.c Mon Oct 27 00:25:19 2008 +0000
+++ b/i386/dsputil_mmx.c Mon Oct 27 14:35:58 2008 +0000
@@ -155,7 +155,7 @@
 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
 
-#include "dsputil_mmx_rnd.h"
+#include "dsputil_mmx_rnd_template.c"
 
 #undef DEF
 #undef SET_RND
@@ -169,7 +169,7 @@
 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
 
-#include "dsputil_mmx_rnd.h"
+#include "dsputil_mmx_rnd_template.c"
 
 #undef DEF
 #undef SET_RND
@@ -182,7 +182,7 @@
 #define DEF(x) x ## _3dnow
 #define PAVGB "pavgusb"
 
-#include "dsputil_mmx_avg.h"
+#include "dsputil_mmx_avg_template.c"
 
 #undef DEF
 #undef PAVGB
@@ -195,7 +195,7 @@
 /* Introduced only in MMX2 set */
 #define PAVGB "pavgb"
 
-#include "dsputil_mmx_avg.h"
+#include "dsputil_mmx_avg_template.c"
 
 #undef DEF
 #undef PAVGB
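
These hunks show the convention the whole rename is about: the includer defines the macros the template consumes (DEF for name mangling, SET_RND/PAVGB/PAVGBP for the instruction flavour), #include's the body, #undef's everything, then repeats with new definitions to stamp out another variant. A self-contained sketch of the idiom — the body is folded into a macro here only so the example compiles standalone, whereas in the tree it sits in the _template.c file:

#include <stdint.h>

#define TEMPLATE_BODY \
static void DEF(avg2)(uint8_t *dst, const uint8_t *a, \
                      const uint8_t *b, int n)        \
{                                                     \
    for (int i = 0; i < n; i++)                       \
        dst[i] = (uint8_t)AVG(a[i], b[i]);            \
}

#define DEF(x) x ## _rnd
#define AVG(a, b) (((a) + (b) + 1) >> 1)  /* rounds up, like pavgb */
TEMPLATE_BODY                             /* instantiates avg2_rnd() */
#undef DEF
#undef AVG

#define DEF(x) x ## _no_rnd
#define AVG(a, b) (((a) + (b)) >> 1)      /* truncates */
TEMPLATE_BODY                             /* instantiates avg2_no_rnd() */
#undef DEF
#undef AVG

Because the template is expanded in place rather than compiled on its own, an inclusion guard would defeat its purpose — hence the "intentionally has no multiple inclusion guards" comment at the top of each template file.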
--- a/i386/dsputil_mmx_avg.h Mon Oct 27 00:25:19 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,900 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmx2 - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* This header intentionally has no multiple inclusion guards. It is meant to - * be included multiple times and generates different code depending on the - * value of certain #defines. */ - -/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm - clobber bug - now it will work with 2.95.2 and also with -fPIC - */ -static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%2), %%mm2 \n\t" - "movd 4(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd 8(%2), %%mm2 \n\t" - "movd 12(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - 
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - -static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 4(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 8(%2), %%mm0 \n\t" - PAVGB" 12(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - -static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add 
%%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm0 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -/* Note this is not correctly rounded, but this function is only - * used for B-frames so it does not matter. 
*/ -static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - PAVGB" (%2), %%mm2 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - do { - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "movd (%1, %2), %%mm1 \n\t" - "movd (%1, %2, 2), %%mm2 \n\t" - "movd (%1, %3), %%mm3 \n\t" - PAVGB" (%0), %%mm0 \n\t" - PAVGB" (%0, %2), %%mm1 \n\t" - PAVGB" (%0, %2, 2), %%mm2 \n\t" - PAVGB" (%0, %3), %%mm3 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, (%1, %2) \n\t" - "movd %%mm2, (%1, %2, 2) \n\t" - "movd %%mm3, (%1, %3) \n\t" - ::"S"(pixels), "D"(block), - "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) - :"memory"); - block += 4*line_size; - pixels += 4*line_size; - h -= 4; - } while(h > 0); -} - -//FIXME the following could be optimized too ... 
-static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_pixels8_y2)(block , pixels , line_size, h); - DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8)(block , pixels , line_size, h); - DEF(avg_pixels8)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_x2)(block , pixels , line_size, h); - DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_y2)(block , pixels , line_size, h); - DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_xy2)(block , pixels , line_size, h); - DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -#define QPEL_2TAP_L3(OPNAME) \ -static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%1,%2), %%mm0 \n\t"\ - "movq 8(%1,%2), %%mm1 \n\t"\ - PAVGB" (%1,%3), %%mm0 \n\t"\ - PAVGB" 8(%1,%3), %%mm1 \n\t"\ - PAVGB" (%1), %%mm0 \n\t"\ - PAVGB" 8(%1), %%mm1 \n\t"\ - STORE_OP( (%1,%4),%%mm0)\ - STORE_OP(8(%1,%4),%%mm1)\ - "movq %%mm0, (%1,%4) \n\t"\ - "movq %%mm1, 8(%1,%4) \n\t"\ - "add %5, %1 \n\t"\ - "decl %0 \n\t"\ - "jnz 1b \n\t"\ - :"+g"(h), "+r"(src)\ - :"r"((x86_reg)off1), "r"((x86_reg)off2),\ - "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ - :"memory"\ - );\ -}\ -static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%1,%2), %%mm0 \n\t"\ - PAVGB" (%1,%3), %%mm0 \n\t"\ - PAVGB" (%1), %%mm0 \n\t"\ - STORE_OP((%1,%4),%%mm0)\ - "movq %%mm0, (%1,%4) \n\t"\ - "add %5, %1 \n\t"\ - "decl %0 \n\t"\ - "jnz 1b \n\t"\ - :"+g"(h), "+r"(src)\ - :"r"((x86_reg)off1), "r"((x86_reg)off2),\ - "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ - :"memory"\ - );\ -} - -#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" -QPEL_2TAP_L3(avg_) -#undef STORE_OP -#define STORE_OP(a,b) -QPEL_2TAP_L3(put_) -#undef STORE_OP -#undef QPEL_2TAP_L3
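
The put_no_rnd_*_l2 functions deleted above (and re-added verbatim below) lean on a complement identity: PAVGB rounds up, computing (a+b+1)>>1 per byte, so XOR-ing both operands and the result with the all-ones mask built by pcmpeqb %%mm6, %%mm6 turns it into the truncating average (a+b)>>1. A scalar sketch that verifies the identity exhaustively; illustrative code, not part of the patch:

#include <assert.h>
#include <stdint.h>

static uint8_t avg_round_up(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);   /* what pavgb computes per byte */
}

static uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    /* pxor mm6 / pavgb / pxor mm6, with mm6 = 0xFF..FF from pcmpeqb */
    return (uint8_t)~avg_round_up((uint8_t)~a, (uint8_t)~b);
}

int main(void)
{
    for (int a = 0; a < 256; a++)
        for (int b = 0; b < 256; b++)
            assert(avg_no_rnd((uint8_t)a, (uint8_t)b) == ((a + b) >> 1));
    return 0;
}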
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/dsputil_mmx_avg_template.c Mon Oct 27 14:35:58 2008 +0000 @@ -0,0 +1,900 @@ +/* + * DSP utils : average functions are compiled twice for 3dnow/mmx2 + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* This header intentionally has no multiple inclusion guards. It is meant to + * be included multiple times and generates different code depending on the + * value of certain #defines. */ + +/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm + clobber bug - now it will work with 2.95.2 and also with -fPIC + */ +static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "movd (%2), %%mm2 \n\t" + "movd 4(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "movd 8(%2), %%mm2 \n\t" + "movd 12(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif 
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + + +static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "pcmpeqb %%mm6, %%mm6 \n\t" + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 4(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 8(%2), %%mm0 \n\t" + PAVGB" 12(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + + +static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "pcmpeqb %%mm6, %%mm6 \n\t" + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +/* GL: this function does incorrect rounding if overflow */ +static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +/* GL: this function does incorrect rounding if overflow */ +static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add 
%%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + PAVGB" %%mm3, %%mm0 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +/* Note this is not correctly rounded, but this function is only + * used for B-frames so it does not matter. 
*/ +static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + PAVGB" (%2), %%mm2 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + do { + __asm__ volatile( + "movd (%1), %%mm0 \n\t" + "movd (%1, %2), %%mm1 \n\t" + "movd (%1, %2, 2), %%mm2 \n\t" + "movd (%1, %3), %%mm3 \n\t" + PAVGB" (%0), %%mm0 \n\t" + PAVGB" (%0, %2), %%mm1 \n\t" + PAVGB" (%0, %2, 2), %%mm2 \n\t" + PAVGB" (%0, %3), %%mm3 \n\t" + "movd %%mm0, (%1) \n\t" + "movd %%mm1, (%1, %2) \n\t" + "movd %%mm2, (%1, %2, 2) \n\t" + "movd %%mm3, (%1, %3) \n\t" + ::"S"(pixels), "D"(block), + "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) + :"memory"); + block += 4*line_size; + pixels += 4*line_size; + h -= 4; + } while(h > 0); +} + +//FIXME the following could be optimized too ... 
+static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_pixels8_y2)(block , pixels , line_size, h); + DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8)(block , pixels , line_size, h); + DEF(avg_pixels8)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_x2)(block , pixels , line_size, h); + DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_y2)(block , pixels , line_size, h); + DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_xy2)(block , pixels , line_size, h); + DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); +} + +#define QPEL_2TAP_L3(OPNAME) \ +static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + "movq 8(%1,%2), %%mm1 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" 8(%1,%3), %%mm1 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + PAVGB" 8(%1), %%mm1 \n\t"\ + STORE_OP( (%1,%4),%%mm0)\ + STORE_OP(8(%1,%4),%%mm1)\ + "movq %%mm0, (%1,%4) \n\t"\ + "movq %%mm1, 8(%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((x86_reg)off1), "r"((x86_reg)off2),\ + "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ + :"memory"\ + );\ +}\ +static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + STORE_OP((%1,%4),%%mm0)\ + "movq %%mm0, (%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((x86_reg)off1), "r"((x86_reg)off2),\ + "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ + :"memory"\ + );\ +} + +#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" +QPEL_2TAP_L3(avg_) +#undef STORE_OP +#define STORE_OP(a,b) +QPEL_2TAP_L3(put_) +#undef STORE_OP +#undef QPEL_2TAP_L3
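[Editor's note] The put_no_rnd_* kernels above rely on a ones-vector trick: PAVGB computes the rounded average (a+b+1)>>1, so subtracting 1 from one operand with unsigned saturation (MOVQ_BONE loads 0x01 bytes, psubusb applies them) turns it into the truncated average (a+b)>>1, except when that operand is already 0, which is exactly the "incorrect rounding if overflow" caveat in the comments. A scalar sketch of the idea (illustration only, not part of the patch):

    #include <stdint.h>

    static uint8_t avg_rnd(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);     /* what PAVGB computes */
    }

    static uint8_t avg_no_rnd_trick(uint8_t a, uint8_t b)
    {
        uint8_t t = a ? a - 1 : 0;              /* psubusb of the 0x01 bytes */
        return (uint8_t)((t + b + 1) >> 1);     /* == (a+b)>>1 unless a == 0 */
    }

    /* put_pixels8_x2: x-halfpel interpolation of an 8-wide block, h rows */
    static void put_pixels8_x2_c(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
    {
        int x, y;
        for (y = 0; y < h; y++) {
            for (x = 0; x < 8; x++)
                block[x] = avg_rnd(pixels[x], pixels[x + 1]);
            pixels += line_size;
            block  += line_size;
        }
    }

The avg_* variants additionally average the interpolated result with what is already in block, and the pixels16 wrappers at the end of the file simply run the 8-wide kernel twice, 8 bytes apart.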
--- a/i386/dsputil_mmx_qns.h Mon Oct 27 00:25:19 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -/* - * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 - * Copyright (c) 2004 Michael Niedermayer - * - * MMX optimization by Michael Niedermayer <michaelni@gmx.at> - * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* This header intentionally has no multiple inclusion guards. It is meant to - * be included multiple times and generates different code depending on the - * value of certain #defines. */ - -#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) - -static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - assert(FFABS(scale) < MAX_ABS); - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - - SET_RND(mm6); - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %4, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - "psraw $6, %%mm1 \n\t" - "pmullw (%3, %0), %%mm0 \n\t" - "pmullw 8(%3, %0), %%mm1 \n\t" - "pmaddwd %%mm0, %%mm0 \n\t" - "pmaddwd %%mm1, %%mm1 \n\t" - "paddd %%mm1, %%mm0 \n\t" - "psrld $4, %%mm0 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" //FIXME optimize & bench - " jb 1b \n\t" - PHADDD(%%mm7, %%mm6) - "psrld $2, %%mm7 \n\t" - "movd %%mm7, %0 \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "r"(weight), "g"(scale) - ); - return i; -} - -static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - if(FFABS(scale) < MAX_ABS){ - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - SET_RND(mm6); - __asm__ volatile( - "movd %3, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "movq %%mm0, (%2, %0) \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "g"(scale) - ); - }else{ - for(i=0; i<8*8; i++){ - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } - } -}
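[Editor's note] The block comment near the top of the file states why these sources carry no inclusion guards: the including .c file defines a few macros, includes the template, undefines them, and repeats. A minimal self-contained sketch of that idiom (hypothetical file and function names, not from the tree):

    /* op_template.c : no inclusion guard on purpose */
    static int DEF(sum2)(int a, int b)
    {
        return OP(a, b);                /* OP is supplied by the includer */
    }

    /* user.c */
    #include <stdio.h>

    #define DEF(x) x ## _add
    #define OP(a, b) ((a) + (b))
    #include "op_template.c"            /* emits sum2_add() */
    #undef DEF
    #undef OP

    #define DEF(x) x ## _mul
    #define OP(a, b) ((a) * (b))
    #include "op_template.c"            /* emits sum2_mul() */
    #undef DEF
    #undef OP

    int main(void)
    {
        printf("%d %d\n", sum2_add(3, 4), sum2_mul(3, 4)); /* prints: 7 12 */
        return 0;
    }

This is the same pattern dsputilenc_mmx.c uses further down to instantiate the QNS functions three times (MMX, 3DNow!, SSSE3) from one body, and it is why the rename to a _template.c suffix is the honest name for these files.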
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/dsputil_mmx_qns_template.c Mon Oct 27 14:35:58 2008 +0000 @@ -0,0 +1,105 @@ +/* + * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 + * Copyright (c) 2004 Michael Niedermayer + * + * MMX optimization by Michael Niedermayer <michaelni@gmx.at> + * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* This header intentionally has no multiple inclusion guards. It is meant to + * be included multiple times and generates different code depending on the + * value of certain #defines. */ + +#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) + +static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) +{ + x86_reg i=0; + + assert(FFABS(scale) < MAX_ABS); + scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; + + SET_RND(mm6); + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "movd %4, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" + PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) + "paddw (%2, %0), %%mm0 \n\t" + "paddw 8(%2, %0), %%mm1 \n\t" + "psraw $6, %%mm0 \n\t" + "psraw $6, %%mm1 \n\t" + "pmullw (%3, %0), %%mm0 \n\t" + "pmullw 8(%3, %0), %%mm1 \n\t" + "pmaddwd %%mm0, %%mm0 \n\t" + "pmaddwd %%mm1, %%mm1 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "psrld $4, %%mm0 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" //FIXME optimize & bench + " jb 1b \n\t" + PHADDD(%%mm7, %%mm6) + "psrld $2, %%mm7 \n\t" + "movd %%mm7, %0 \n\t" + + : "+r" (i) + : "r"(basis), "r"(rem), "r"(weight), "g"(scale) + ); + return i; +} + +static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) +{ + x86_reg i=0; + + if(FFABS(scale) < MAX_ABS){ + scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; + SET_RND(mm6); + __asm__ volatile( + "movd %3, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" + PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) + "paddw (%2, %0), %%mm0 \n\t" + "paddw 8(%2, %0), %%mm1 \n\t" + "movq %%mm0, (%2, %0) \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" // FIXME optimize & bench + " jb 1b \n\t" + + : "+r" (i) + : "r"(basis), "r"(rem), "g"(scale) + ); + }else{ + for(i=0; i<8*8; i++){ + rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); + } + } +}
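[Editor's note] For reference, the try_8x8basis asm above evaluates a weighted squared error: scale the basis function in fixed point, add it to the residual, shift down by RECON_SHIFT (the psraw $6), weight, square pairwise (pmaddwd) and accumulate. A scalar sketch of the same computation, assuming the usual dsputil.h values BASIS_SHIFT 16 and RECON_SHIFT 6 (illustration, not the patch's code):

    #include <stdint.h>

    #define BASIS_SHIFT 16   /* assumed; defined in dsputil.h */
    #define RECON_SHIFT  6   /* assumed; defined in dsputil.h */

    static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64],
                              const int16_t basis[64], int scale)
    {
        int i;
        unsigned int sum = 0;

        for (i = 0; i < 64; i++) {
            /* rounded fixed-point scaling of the basis function */
            int b = rem[i] + ((basis[i] * scale +
                               (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                              (BASIS_SHIFT - RECON_SHIFT));
            int w = weight[i];
            b >>= RECON_SHIFT;
            /* w*b must stay within 16 bits, matching the pmaddwd operands */
            sum += (unsigned int)((w * b) * (w * b)) >> 4;  /* psrld $4 */
        }
        return (int)(sum >> 2);                             /* final psrld $2 */
    }

add_8x8basis needs no such sketch: its |scale| >= MAX_ABS fallback branch above is already the scalar version of the loop.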
--- a/i386/dsputil_mmx_rnd.h Mon Oct 27 00:25:19 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,594 +0,0 @@ -/* - * DSP utils mmx functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* This header intentionally has no multiple inclusion guards. It is meant to - * be included multiple times and generates different code depending on the - * value of certain #defines. */ - -// put_pixels -static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm5, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 16(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - "add $32, %2 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm5, (%3) \n\t" - "add %5, %3 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - 
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - -static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 16(%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - -static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - 
:"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// avg_pixels -static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movd %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, 
%%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq %2, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*dst) - :"m"(*src1), "m"(*src2) - :"memory"); - dst += dstStride; - src1 += src1Stride; - src2 += 8; - } while (--h); -} - -static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 9%1, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq %2, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 8%2, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*dst) - :"m"(*src1), "m"(*src2) - :"memory"); - dst += dstStride; - src1 += src1Stride; - src2 += 16; - } while (--h); -} - -static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" 
\n\t" - "add %3, %1 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put, pixels8_y2)(block , pixels , line_size, h); - DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg, pixels8_y2)(block , pixels , line_size, h); - DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/dsputil_mmx_rnd_template.c Mon Oct 27 14:35:58 2008 +0000 @@ -0,0 +1,594 @@ +/* + * DSP utils mmx functions are compiled twice for rnd/no_rnd + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* This header intentionally has no multiple inclusion guards. It is meant to + * be included multiple times and generates different code depending on the + * value of certain #defines. */ + +// put_pixels +static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + "add $32, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "add %5, %3 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else 
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + +static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + +static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"),%%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"),%%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + 
:"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" + "add %3, %1 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "movq %%mm4, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :REG_a, "memory"); +} + +// avg_pixels +static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movd %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +// in case more speed is needed - unroling would certainly help +static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + "movq 8%0, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, 
%%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 8; + } while (--h); +} + +static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 9%1, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 8%2, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 16; + } while (--h); +} + +static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +// this routine is 'slightly' suboptimal but mostly unused +static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"REG_a", %%"REG_a" 
\n\t" + "add %3, %1 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) + "movq %%mm5, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :REG_a, "memory"); +} + +//FIXME optimize +static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put, pixels8_y2)(block , pixels , line_size, h); + DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put, pixels8_xy2)(block , pixels , line_size, h); + DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg, pixels8_y2)(block , pixels , line_size, h); + DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg, pixels8_xy2)(block , pixels , line_size, h); + DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); +}
--- a/i386/dsputilenc_mmx.c Mon Oct 27 00:25:19 2008 +0000 +++ b/i386/dsputilenc_mmx.c Mon Oct 27 14:35:58 2008 +0000 @@ -1304,7 +1304,7 @@ #define SET_RND MOVQ_WONE #define SCALE_OFFSET 1 -#include "dsputil_mmx_qns.h" +#include "dsputil_mmx_qns_template.c" #undef DEF #undef SET_RND @@ -1318,7 +1318,7 @@ "pmulhrw " #s ", "#x " \n\t"\ "pmulhrw " #s ", "#y " \n\t" -#include "dsputil_mmx_qns.h" +#include "dsputil_mmx_qns_template.c" #undef DEF #undef SET_RND @@ -1337,7 +1337,7 @@ "pmulhrsw " #s ", "#x " \n\t"\ "pmulhrsw " #s ", "#y " \n\t" -#include "dsputil_mmx_qns.h" +#include "dsputil_mmx_qns_template.c" #undef DEF #undef SET_RND
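[Editor's note] The three inclusions in this hunk instantiate the QNS template with a different rounded high-half multiply each time. In scalar terms the two hardware variants behave as below (sketch; the one-bit difference in the downshift is what the per-variant SCALE_OFFSET, visible as 1 for the MMX case, compensates for when scale is pre-shifted):

    #include <stdint.h>

    /* 3DNow! pmulhrw, per word (arithmetic right shift assumed) */
    static int16_t mulhrw_3dnow(int16_t a, int16_t b)
    {
        return (int16_t)((a * b + 0x8000) >> 16);
    }

    /* SSSE3 pmulhrsw: one bit less downshift, hence a different
     * SCALE_OFFSET for that instantiation */
    static int16_t mulhrsw_ssse3(int16_t a, int16_t b)
    {
        return (int16_t)((a * b + 0x4000) >> 15);
    }

The plain-MMX instantiation has no such instruction, so its PMULHRW (defined in the elided context above this hunk) is built around pmulhw together with the MOVQ_WONE rounding word that SET_RND loads.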
--- a/imgconvert.c Mon Oct 27 00:25:19 2008 +0000 +++ b/imgconvert.c Mon Oct 27 14:35:58 2008 +0000 @@ -1416,7 +1416,7 @@ #define BPP 2 -#include "imgconvert_template.h" +#include "imgconvert_template.c" /* rgb565 handling */ @@ -1437,7 +1437,7 @@ #define BPP 2 -#include "imgconvert_template.h" +#include "imgconvert_template.c" /* bgr24 handling */ @@ -1459,7 +1459,7 @@ #define BPP 3 -#include "imgconvert_template.h" +#include "imgconvert_template.c" #undef RGB_IN #undef RGB_OUT @@ -1486,7 +1486,7 @@ #define BPP 3 -#include "imgconvert_template.h" +#include "imgconvert_template.c" /* rgb32 handling */ @@ -1517,7 +1517,7 @@ #define BPP 4 -#include "imgconvert_template.h" +#include "imgconvert_template.c" static void mono_to_gray(AVPicture *dst, const AVPicture *src, int width, int height, int xor_mask)
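[Editor's note] Each inclusion of imgconvert_template.c in this hunk is preceded by a small pixel-format "interface": RGB_NAME, BPP and the RGB_IN/RGB_OUT accessors, so the very same template loops read or write rgb555, rgb565, bgr24, rgb24 or rgb32. Illustrative definitions along these lines (sketch; the real macros live in the elided imgconvert.c context and may differ in detail):

    #include <stdint.h>

    /* rgb565, BPP 2: pack 8-bit components into one 5:6:5 word */
    #define RGB565_OUT(d, r, g, b) \
        (((uint16_t *)(d))[0] = (uint16_t)((((r) & 0xf8) << 8) | \
                                           (((g) & 0xfc) << 3) | \
                                           ((b) >> 3)))

    /* rgb24, BPP 3: three bytes in memory order */
    #define RGB24_OUT(d, r, g, b) \
        ((d)[0] = (uint8_t)(r), (d)[1] = (uint8_t)(g), (d)[2] = (uint8_t)(b))

With such macros in place, one template body like glue(yuv420p_to_, RGB_NAME) expands to yuv420p_to_rgb565, yuv420p_to_rgb24 and so on, each advancing its destination pointer by its own BPP, which is why the template below can stay format-agnostic.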
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/imgconvert_template.c Mon Oct 27 14:35:58 2008 +0000 @@ -0,0 +1,879 @@ +/* + * templates for image conversion routines + * Copyright (c) 2001, 2002, 2003 Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* This header intentionally has no multiple inclusion guards. It is meant to + * be included multiple times and generates different code depending on the + * value of certain #defines. */ + +#ifndef RGB_OUT +#define RGB_OUT(d, r, g, b) RGBA_OUT(d, r, g, b, 0xff) +#endif + +static void glue(yuv420p_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr; + uint8_t *d, *d1, *d2; + int w, y, cb, cr, r_add, g_add, b_add, width2; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + unsigned int r, g, b; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + width2 = (width + 1) >> 1; + for(;height >= 2; height -= 2) { + d1 = d; + d2 = d + dst->linesize[0]; + y2_ptr = y1_ptr + src->linesize[0]; + for(w = width; w >= 2; w -= 2) { + YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); + /* output 4 pixels */ + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[1]); + RGB_OUT(d1 + BPP, r, g, b); + + YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[0]); + RGB_OUT(d2, r, g, b); + + YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[1]); + RGB_OUT(d2 + BPP, r, g, b); + + d1 += 2 * BPP; + d2 += 2 * BPP; + + y1_ptr += 2; + y2_ptr += 2; + cb_ptr++; + cr_ptr++; + } + /* handle odd width */ + if (w) { + YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[0]); + RGB_OUT(d2, r, g, b); + d1 += BPP; + d2 += BPP; + y1_ptr++; + y2_ptr++; + cb_ptr++; + cr_ptr++; + } + d += 2 * dst->linesize[0]; + y1_ptr += 2 * src->linesize[0] - width; + cb_ptr += src->linesize[1] - width2; + cr_ptr += src->linesize[2] - width2; + } + /* handle odd height */ + if (height) { + d1 = d; + for(w = width; w >= 2; w -= 2) { + YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); + /* output 2 pixels */ + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[1]); + RGB_OUT(d1 + BPP, r, g, b); + + d1 += 2 * BPP; + + y1_ptr += 2; + cb_ptr++; + cr_ptr++; + } + /* handle width */ + if (w) { + YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); + /* output 2 pixels */ + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + d1 += BPP; + + y1_ptr++; + cb_ptr++; + cr_ptr++; + } + } +} + +static void glue(yuvj420p_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr; + uint8_t *d, *d1, *d2; + int w, y, cb, cr, r_add, g_add, b_add, width2; + uint8_t *cm = 
ff_cropTbl + MAX_NEG_CROP; + unsigned int r, g, b; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + width2 = (width + 1) >> 1; + for(;height >= 2; height -= 2) { + d1 = d; + d2 = d + dst->linesize[0]; + y2_ptr = y1_ptr + src->linesize[0]; + for(w = width; w >= 2; w -= 2) { + YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); + /* output 4 pixels */ + YUV_TO_RGB2(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2(r, g, b, y1_ptr[1]); + RGB_OUT(d1 + BPP, r, g, b); + + YUV_TO_RGB2(r, g, b, y2_ptr[0]); + RGB_OUT(d2, r, g, b); + + YUV_TO_RGB2(r, g, b, y2_ptr[1]); + RGB_OUT(d2 + BPP, r, g, b); + + d1 += 2 * BPP; + d2 += 2 * BPP; + + y1_ptr += 2; + y2_ptr += 2; + cb_ptr++; + cr_ptr++; + } + /* handle odd width */ + if (w) { + YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); + YUV_TO_RGB2(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2(r, g, b, y2_ptr[0]); + RGB_OUT(d2, r, g, b); + d1 += BPP; + d2 += BPP; + y1_ptr++; + y2_ptr++; + cb_ptr++; + cr_ptr++; + } + d += 2 * dst->linesize[0]; + y1_ptr += 2 * src->linesize[0] - width; + cb_ptr += src->linesize[1] - width2; + cr_ptr += src->linesize[2] - width2; + } + /* handle odd height */ + if (height) { + d1 = d; + for(w = width; w >= 2; w -= 2) { + YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); + /* output 2 pixels */ + YUV_TO_RGB2(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + + YUV_TO_RGB2(r, g, b, y1_ptr[1]); + RGB_OUT(d1 + BPP, r, g, b); + + d1 += 2 * BPP; + + y1_ptr += 2; + cb_ptr++; + cr_ptr++; + } + /* handle width */ + if (w) { + YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); + /* output 2 pixels */ + YUV_TO_RGB2(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + d1 += BPP; + + y1_ptr++; + cb_ptr++; + cr_ptr++; + } + } +} + +static void glue(RGB_NAME, _to_yuv420p)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + int wrap, wrap3, width2; + int r, g, b, r1, g1, b1, w; + uint8_t *lum, *cb, *cr; + const uint8_t *p; + + lum = dst->data[0]; + cb = dst->data[1]; + cr = dst->data[2]; + + width2 = (width + 1) >> 1; + wrap = dst->linesize[0]; + wrap3 = src->linesize[0]; + p = src->data[0]; + for(;height>=2;height -= 2) { + for(w = width; w >= 2; w -= 2) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y_CCIR(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y_CCIR(r, g, b); + p += wrap3; + lum += wrap; + + RGB_IN(r, g, b, p); + r1 += r; + g1 += g; + b1 += b; + lum[0] = RGB_TO_Y_CCIR(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y_CCIR(r, g, b); + + cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 2); + cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 2); + + cb++; + cr++; + p += -wrap3 + 2 * BPP; + lum += -wrap + 2; + } + if (w) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y_CCIR(r, g, b); + p += wrap3; + lum += wrap; + RGB_IN(r, g, b, p); + r1 += r; + g1 += g; + b1 += b; + lum[0] = RGB_TO_Y_CCIR(r, g, b); + cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 1); + cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 1); + cb++; + cr++; + p += -wrap3 + BPP; + lum += -wrap + 1; + } + p += wrap3 + (wrap3 - width * BPP); + lum += wrap + (wrap - width); + cb += dst->linesize[1] - width2; + cr += dst->linesize[2] - width2; + } + /* handle odd height */ + if (height) { + for(w = width; w >= 2; w -= 2) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y_CCIR(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y_CCIR(r, g, b); + cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 1); + cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 
1); + cb++; + cr++; + p += 2 * BPP; + lum += 2; + } + if (w) { + RGB_IN(r, g, b, p); + lum[0] = RGB_TO_Y_CCIR(r, g, b); + cb[0] = RGB_TO_U_CCIR(r, g, b, 0); + cr[0] = RGB_TO_V_CCIR(r, g, b, 0); + } + } +} + +static void glue(RGB_NAME, _to_gray)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const unsigned char *p; + unsigned char *q; + int r, g, b, dst_wrap, src_wrap; + int x, y; + + p = src->data[0]; + src_wrap = src->linesize[0] - BPP * width; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - width; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + RGB_IN(r, g, b, p); + q[0] = RGB_TO_Y(r, g, b); + q++; + p += BPP; + } + p += src_wrap; + q += dst_wrap; + } +} + +static void glue(gray_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const unsigned char *p; + unsigned char *q; + int r, dst_wrap, src_wrap; + int x, y; + + p = src->data[0]; + src_wrap = src->linesize[0] - width; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - BPP * width; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + r = p[0]; + RGB_OUT(q, r, r, r); + q += BPP; + p ++; + } + p += src_wrap; + q += dst_wrap; + } +} + +static void glue(pal8_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const unsigned char *p; + unsigned char *q; + int r, g, b, dst_wrap, src_wrap; + int x, y; + uint32_t v; + const uint32_t *palette; + + p = src->data[0]; + src_wrap = src->linesize[0] - width; + palette = (uint32_t *)src->data[1]; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - BPP * width; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + v = palette[p[0]]; + r = (v >> 16) & 0xff; + g = (v >> 8) & 0xff; + b = (v) & 0xff; +#ifdef RGBA_OUT + { + int a; + a = (v >> 24) & 0xff; + RGBA_OUT(q, r, g, b, a); + } +#else + RGB_OUT(q, r, g, b); +#endif + q += BPP; + p ++; + } + p += src_wrap; + q += dst_wrap; + } +} + +// RGB24 has optimized routines +#if !defined(FMT_RGB32) && !defined(FMT_RGB24) +/* alpha support */ + +static void glue(rgb32_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *s; + uint8_t *d; + int src_wrap, dst_wrap, j, y; + unsigned int v, r, g, b; +#ifdef RGBA_OUT + unsigned int a; +#endif + + s = src->data[0]; + src_wrap = src->linesize[0] - width * 4; + + d = dst->data[0]; + dst_wrap = dst->linesize[0] - width * BPP; + + for(y=0;y<height;y++) { + for(j = 0;j < width; j++) { + v = ((const uint32_t *)(s))[0]; + r = (v >> 16) & 0xff; + g = (v >> 8) & 0xff; + b = v & 0xff; +#ifdef RGBA_OUT + a = (v >> 24) & 0xff; + RGBA_OUT(d, r, g, b, a); +#else + RGB_OUT(d, r, g, b); +#endif + s += 4; + d += BPP; + } + s += src_wrap; + d += dst_wrap; + } +} + +static void glue(RGB_NAME, _to_rgb32)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *s; + uint8_t *d; + int src_wrap, dst_wrap, j, y; + unsigned int r, g, b; +#ifdef RGBA_IN + unsigned int a; +#endif + + s = src->data[0]; + src_wrap = src->linesize[0] - width * BPP; + + d = dst->data[0]; + dst_wrap = dst->linesize[0] - width * 4; + + for(y=0;y<height;y++) { + for(j = 0;j < width; j++) { +#ifdef RGBA_IN + RGBA_IN(r, g, b, a, s); + ((uint32_t *)(d))[0] = (a << 24) | (r << 16) | (g << 8) | b; +#else + RGB_IN(r, g, b, s); + ((uint32_t *)(d))[0] = (0xff << 24) | (r << 16) | (g << 8) | b; +#endif + d += 4; + s += BPP; + } + s += src_wrap; + d += dst_wrap; + } +} + +#endif /* !defined(FMT_RGB32) */ + +#ifndef FMT_RGB24 + +static void glue(rgb24_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, + 
int width, int height) +{ + const uint8_t *s; + uint8_t *d; + int src_wrap, dst_wrap, j, y; + unsigned int r, g, b; + + s = src->data[0]; + src_wrap = src->linesize[0] - width * 3; + + d = dst->data[0]; + dst_wrap = dst->linesize[0] - width * BPP; + + for(y=0;y<height;y++) { + for(j = 0;j < width; j++) { + r = s[0]; + g = s[1]; + b = s[2]; + RGB_OUT(d, r, g, b); + s += 3; + d += BPP; + } + s += src_wrap; + d += dst_wrap; + } +} + +static void glue(RGB_NAME, _to_rgb24)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *s; + uint8_t *d; + int src_wrap, dst_wrap, j, y; + unsigned int r, g , b; + + s = src->data[0]; + src_wrap = src->linesize[0] - width * BPP; + + d = dst->data[0]; + dst_wrap = dst->linesize[0] - width * 3; + + for(y=0;y<height;y++) { + for(j = 0;j < width; j++) { + RGB_IN(r, g, b, s) + d[0] = r; + d[1] = g; + d[2] = b; + d += 3; + s += BPP; + } + s += src_wrap; + d += dst_wrap; + } +} + +#endif /* !FMT_RGB24 */ + +#ifdef FMT_RGB24 + +static void yuv444p_to_rgb24(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *y1_ptr, *cb_ptr, *cr_ptr; + uint8_t *d, *d1; + int w, y, cb, cr, r_add, g_add, b_add; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + unsigned int r, g, b; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + for(;height > 0; height --) { + d1 = d; + for(w = width; w > 0; w--) { + YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); + + YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + d1 += BPP; + + y1_ptr++; + cb_ptr++; + cr_ptr++; + } + d += dst->linesize[0]; + y1_ptr += src->linesize[0] - width; + cb_ptr += src->linesize[1] - width; + cr_ptr += src->linesize[2] - width; + } +} + +static void yuvj444p_to_rgb24(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const uint8_t *y1_ptr, *cb_ptr, *cr_ptr; + uint8_t *d, *d1; + int w, y, cb, cr, r_add, g_add, b_add; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + unsigned int r, g, b; + + d = dst->data[0]; + y1_ptr = src->data[0]; + cb_ptr = src->data[1]; + cr_ptr = src->data[2]; + for(;height > 0; height --) { + d1 = d; + for(w = width; w > 0; w--) { + YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); + + YUV_TO_RGB2(r, g, b, y1_ptr[0]); + RGB_OUT(d1, r, g, b); + d1 += BPP; + + y1_ptr++; + cb_ptr++; + cr_ptr++; + } + d += dst->linesize[0]; + y1_ptr += src->linesize[0] - width; + cb_ptr += src->linesize[1] - width; + cr_ptr += src->linesize[2] - width; + } +} + +static void rgb24_to_yuv444p(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + int src_wrap, x, y; + int r, g, b; + uint8_t *lum, *cb, *cr; + const uint8_t *p; + + lum = dst->data[0]; + cb = dst->data[1]; + cr = dst->data[2]; + + src_wrap = src->linesize[0] - width * BPP; + p = src->data[0]; + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + RGB_IN(r, g, b, p); + lum[0] = RGB_TO_Y_CCIR(r, g, b); + cb[0] = RGB_TO_U_CCIR(r, g, b, 0); + cr[0] = RGB_TO_V_CCIR(r, g, b, 0); + p += BPP; + cb++; + cr++; + lum++; + } + p += src_wrap; + lum += dst->linesize[0] - width; + cb += dst->linesize[1] - width; + cr += dst->linesize[2] - width; + } +} + +static void rgb24_to_yuvj420p(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + int wrap, wrap3, width2; + int r, g, b, r1, g1, b1, w; + uint8_t *lum, *cb, *cr; + const uint8_t *p; + + lum = dst->data[0]; + cb = dst->data[1]; + cr = dst->data[2]; + + width2 = (width + 1) >> 1; + wrap = dst->linesize[0]; + wrap3 = src->linesize[0]; + p = src->data[0]; + for(;height>=2;height -= 2) { + for(w 
= width; w >= 2; w -= 2) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y(r, g, b); + p += wrap3; + lum += wrap; + + RGB_IN(r, g, b, p); + r1 += r; + g1 += g; + b1 += b; + lum[0] = RGB_TO_Y(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y(r, g, b); + + cb[0] = RGB_TO_U(r1, g1, b1, 2); + cr[0] = RGB_TO_V(r1, g1, b1, 2); + + cb++; + cr++; + p += -wrap3 + 2 * BPP; + lum += -wrap + 2; + } + if (w) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y(r, g, b); + p += wrap3; + lum += wrap; + RGB_IN(r, g, b, p); + r1 += r; + g1 += g; + b1 += b; + lum[0] = RGB_TO_Y(r, g, b); + cb[0] = RGB_TO_U(r1, g1, b1, 1); + cr[0] = RGB_TO_V(r1, g1, b1, 1); + cb++; + cr++; + p += -wrap3 + BPP; + lum += -wrap + 1; + } + p += wrap3 + (wrap3 - width * BPP); + lum += wrap + (wrap - width); + cb += dst->linesize[1] - width2; + cr += dst->linesize[2] - width2; + } + /* handle odd height */ + if (height) { + for(w = width; w >= 2; w -= 2) { + RGB_IN(r, g, b, p); + r1 = r; + g1 = g; + b1 = b; + lum[0] = RGB_TO_Y(r, g, b); + + RGB_IN(r, g, b, p + BPP); + r1 += r; + g1 += g; + b1 += b; + lum[1] = RGB_TO_Y(r, g, b); + cb[0] = RGB_TO_U(r1, g1, b1, 1); + cr[0] = RGB_TO_V(r1, g1, b1, 1); + cb++; + cr++; + p += 2 * BPP; + lum += 2; + } + if (w) { + RGB_IN(r, g, b, p); + lum[0] = RGB_TO_Y(r, g, b); + cb[0] = RGB_TO_U(r, g, b, 0); + cr[0] = RGB_TO_V(r, g, b, 0); + } + } +} + +static void rgb24_to_yuvj444p(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + int src_wrap, x, y; + int r, g, b; + uint8_t *lum, *cb, *cr; + const uint8_t *p; + + lum = dst->data[0]; + cb = dst->data[1]; + cr = dst->data[2]; + + src_wrap = src->linesize[0] - width * BPP; + p = src->data[0]; + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + RGB_IN(r, g, b, p); + lum[0] = RGB_TO_Y(r, g, b); + cb[0] = RGB_TO_U(r, g, b, 0); + cr[0] = RGB_TO_V(r, g, b, 0); + p += BPP; + cb++; + cr++; + lum++; + } + p += src_wrap; + lum += dst->linesize[0] - width; + cb += dst->linesize[1] - width; + cr += dst->linesize[2] - width; + } +} + +#endif /* FMT_RGB24 */ + +#if defined(FMT_RGB24) || defined(FMT_RGB32) + +static void glue(RGB_NAME, _to_pal8)(AVPicture *dst, const AVPicture *src, + int width, int height) +{ + const unsigned char *p; + unsigned char *q; + int dst_wrap, src_wrap; + int x, y, has_alpha; + unsigned int r, g, b; + + p = src->data[0]; + src_wrap = src->linesize[0] - BPP * width; + + q = dst->data[0]; + dst_wrap = dst->linesize[0] - width; + has_alpha = 0; + + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { +#ifdef RGBA_IN + { + unsigned int a; + RGBA_IN(r, g, b, a, p); + /* crude approximation for alpha ! 
*/ + if (a < 0x80) { + has_alpha = 1; + q[0] = TRANSP_INDEX; + } else { + q[0] = gif_clut_index(r, g, b); + } + } +#else + RGB_IN(r, g, b, p); + q[0] = gif_clut_index(r, g, b); +#endif + q++; + p += BPP; + } + p += src_wrap; + q += dst_wrap; + } + + build_rgb_palette(dst->data[1], has_alpha); +} + +#endif /* defined(FMT_RGB24) || defined(FMT_RGB32) */ + +#ifdef RGBA_IN + +static int glue(get_alpha_info_, RGB_NAME)(const AVPicture *src, + int width, int height) +{ + const unsigned char *p; + int src_wrap, ret, x, y; + unsigned int r, g, b, a; + + p = src->data[0]; + src_wrap = src->linesize[0] - BPP * width; + ret = 0; + for(y=0;y<height;y++) { + for(x=0;x<width;x++) { + RGBA_IN(r, g, b, a, p); + if (a == 0x00) { + ret |= FF_ALPHA_TRANSP; + } else if (a != 0xff) { + ret |= FF_ALPHA_SEMI_TRANSP; + } + p += BPP; + } + p += src_wrap; + } + return ret; +} + +#endif /* RGBA_IN */ + +#undef RGB_IN +#undef RGBA_IN +#undef RGB_OUT +#undef RGBA_OUT +#undef BPP +#undef RGB_NAME +#undef FMT_RGB24 +#undef FMT_RGB32
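
The conversions added above come in pairs: yuv444p_to_rgb24 and rgb24_to_yuv444p use the RGB_TO_*_CCIR and YUV_TO_RGB*_CCIR macros (limited range, Y in 16..235), while the yuvj444p and yuvj420p variants use the plain RGB_TO_* and YUV_TO_RGB* macros (full range, Y in 0..255). The real macros are defined elsewhere in the tree; the sketch below is only a self-contained illustration of the luma difference, with an assumed 15-bit fixed-point scale and weights rounded so they sum to exactly 1.0.

    /* Sketch: full-range vs. limited-range (CCIR 601) luma with BT.601
     * weights.  Assumed 15-bit fixed point; NOT FFmpeg's actual macros. */
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t rgb_to_y_full(int r, int g, int b)
    {
        /* Y = 0.299 R + 0.587 G + 0.114 B; weights sum to exactly 1<<15 */
        return (uint8_t)((9798 * r + 19235 * g + 3735 * b + 16384) >> 15);
    }

    static uint8_t rgb_to_y_ccir(int r, int g, int b)
    {
        /* same weights scaled by 219/255, plus the 16 black-level offset */
        return (uint8_t)(((8414 * r + 16519 * g + 3208 * b + 16384) >> 15) + 16);
    }

    int main(void)
    {
        printf("white: full=%d ccir=%d\n",
               rgb_to_y_full(255, 255, 255), rgb_to_y_ccir(255, 255, 255));
        printf("black: full=%d ccir=%d\n",
               rgb_to_y_full(0, 0, 0), rgb_to_y_ccir(0, 0, 0));
        return 0;   /* prints 255/235 and 0/16 */
    }

White maps to 255 under the full-range formula but 235 under the CCIR one, which is why the template must carry both families of near-identical function bodies.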
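
rgb24_to_yuvj420p above also shows how the template handles 4:2:0 chroma subsampling: luma is written per pixel, but each Cb/Cr sample covers a 2x2 block whose RGB values are summed into r1/g1/b1 and passed to RGB_TO_U/RGB_TO_V with shift = 2 (shift = 1 for a leftover row or column, shift = 0 for a lone corner pixel), so the division by the block size is folded into the macro's final right shift. A minimal sketch of that idea, using a toy full-range U formula rather than the tree's own macro:

    /* Sketch: 4:2:0 chroma averaging as in rgb24_to_yuvj420p.  The r/g/b
     * arguments hold the SUM over 2^shift pixels, so the average is taken
     * by the final right shift.  Toy macro, assumed 15-bit fixed point. */
    #include <stdint.h>
    #include <stdio.h>

    #define SCALE 15
    #define FIX(x) ((int)((x) * (1 << SCALE) + 0.5))

    /* full-range U = 128 - 0.168736 R - 0.331264 G + 0.5 B; relies on
     * arithmetic right shift of negatives, as the template itself does */
    static uint8_t rgb_to_u(int r, int g, int b, int shift)
    {
        return (uint8_t)(((-FIX(0.168736) * r - FIX(0.331264) * g
                           + FIX(0.5) * b) >> (SCALE + shift)) + 128);
    }

    int main(void)
    {
        /* one 2x2 block of packed RGB24 pixels */
        const uint8_t px[4][3] = { {200,0,0}, {100,0,0}, {50,0,0}, {50,0,0} };
        int r1 = 0, g1 = 0, b1 = 0, i;

        for (i = 0; i < 4; i++) {
            r1 += px[i][0];
            g1 += px[i][1];
            b1 += px[i][2];
        }
        /* shift = 2: the sums cover four pixels, as in the even 2x2 case */
        printf("U for the block: %d\n", rgb_to_u(r1, g1, b1, 2));
        return 0;
    }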
--- a/imgconvert_template.h Mon Oct 27 00:25:19 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,879 +0,0 @@ -/* - * templates for image conversion routines - * Copyright (c) 2001, 2002, 2003 Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* This header intentionally has no multiple inclusion guards. It is meant to - * be included multiple times and generates different code depending on the - * value of certain #defines. */ - -#ifndef RGB_OUT -#define RGB_OUT(d, r, g, b) RGBA_OUT(d, r, g, b, 0xff) -#endif - -static void glue(yuv420p_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr; - uint8_t *d, *d1, *d2; - int w, y, cb, cr, r_add, g_add, b_add, width2; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - unsigned int r, g, b; - - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = (width + 1) >> 1; - for(;height >= 2; height -= 2) { - d1 = d; - d2 = d + dst->linesize[0]; - y2_ptr = y1_ptr + src->linesize[0]; - for(w = width; w >= 2; w -= 2) { - YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); - /* output 4 pixels */ - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[1]); - RGB_OUT(d1 + BPP, r, g, b); - - YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[0]); - RGB_OUT(d2, r, g, b); - - YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[1]); - RGB_OUT(d2 + BPP, r, g, b); - - d1 += 2 * BPP; - d2 += 2 * BPP; - - y1_ptr += 2; - y2_ptr += 2; - cb_ptr++; - cr_ptr++; - } - /* handle odd width */ - if (w) { - YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2_CCIR(r, g, b, y2_ptr[0]); - RGB_OUT(d2, r, g, b); - d1 += BPP; - d2 += BPP; - y1_ptr++; - y2_ptr++; - cb_ptr++; - cr_ptr++; - } - d += 2 * dst->linesize[0]; - y1_ptr += 2 * src->linesize[0] - width; - cb_ptr += src->linesize[1] - width2; - cr_ptr += src->linesize[2] - width2; - } - /* handle odd height */ - if (height) { - d1 = d; - for(w = width; w >= 2; w -= 2) { - YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); - /* output 2 pixels */ - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[1]); - RGB_OUT(d1 + BPP, r, g, b); - - d1 += 2 * BPP; - - y1_ptr += 2; - cb_ptr++; - cr_ptr++; - } - /* handle width */ - if (w) { - YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); - /* output 2 pixels */ - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - d1 += BPP; - - y1_ptr++; - cb_ptr++; - cr_ptr++; - } - } -} - -static void glue(yuvj420p_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr; - uint8_t *d, *d1, *d2; - int w, y, cb, cr, r_add, g_add, b_add, width2; - uint8_t *cm = 
ff_cropTbl + MAX_NEG_CROP; - unsigned int r, g, b; - - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - width2 = (width + 1) >> 1; - for(;height >= 2; height -= 2) { - d1 = d; - d2 = d + dst->linesize[0]; - y2_ptr = y1_ptr + src->linesize[0]; - for(w = width; w >= 2; w -= 2) { - YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); - /* output 4 pixels */ - YUV_TO_RGB2(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2(r, g, b, y1_ptr[1]); - RGB_OUT(d1 + BPP, r, g, b); - - YUV_TO_RGB2(r, g, b, y2_ptr[0]); - RGB_OUT(d2, r, g, b); - - YUV_TO_RGB2(r, g, b, y2_ptr[1]); - RGB_OUT(d2 + BPP, r, g, b); - - d1 += 2 * BPP; - d2 += 2 * BPP; - - y1_ptr += 2; - y2_ptr += 2; - cb_ptr++; - cr_ptr++; - } - /* handle odd width */ - if (w) { - YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); - YUV_TO_RGB2(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2(r, g, b, y2_ptr[0]); - RGB_OUT(d2, r, g, b); - d1 += BPP; - d2 += BPP; - y1_ptr++; - y2_ptr++; - cb_ptr++; - cr_ptr++; - } - d += 2 * dst->linesize[0]; - y1_ptr += 2 * src->linesize[0] - width; - cb_ptr += src->linesize[1] - width2; - cr_ptr += src->linesize[2] - width2; - } - /* handle odd height */ - if (height) { - d1 = d; - for(w = width; w >= 2; w -= 2) { - YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); - /* output 2 pixels */ - YUV_TO_RGB2(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - - YUV_TO_RGB2(r, g, b, y1_ptr[1]); - RGB_OUT(d1 + BPP, r, g, b); - - d1 += 2 * BPP; - - y1_ptr += 2; - cb_ptr++; - cr_ptr++; - } - /* handle width */ - if (w) { - YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); - /* output 2 pixels */ - YUV_TO_RGB2(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - d1 += BPP; - - y1_ptr++; - cb_ptr++; - cr_ptr++; - } - } -} - -static void glue(RGB_NAME, _to_yuv420p)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - int wrap, wrap3, width2; - int r, g, b, r1, g1, b1, w; - uint8_t *lum, *cb, *cr; - const uint8_t *p; - - lum = dst->data[0]; - cb = dst->data[1]; - cr = dst->data[2]; - - width2 = (width + 1) >> 1; - wrap = dst->linesize[0]; - wrap3 = src->linesize[0]; - p = src->data[0]; - for(;height>=2;height -= 2) { - for(w = width; w >= 2; w -= 2) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y_CCIR(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y_CCIR(r, g, b); - p += wrap3; - lum += wrap; - - RGB_IN(r, g, b, p); - r1 += r; - g1 += g; - b1 += b; - lum[0] = RGB_TO_Y_CCIR(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y_CCIR(r, g, b); - - cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 2); - cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 2); - - cb++; - cr++; - p += -wrap3 + 2 * BPP; - lum += -wrap + 2; - } - if (w) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y_CCIR(r, g, b); - p += wrap3; - lum += wrap; - RGB_IN(r, g, b, p); - r1 += r; - g1 += g; - b1 += b; - lum[0] = RGB_TO_Y_CCIR(r, g, b); - cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 1); - cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 1); - cb++; - cr++; - p += -wrap3 + BPP; - lum += -wrap + 1; - } - p += wrap3 + (wrap3 - width * BPP); - lum += wrap + (wrap - width); - cb += dst->linesize[1] - width2; - cr += dst->linesize[2] - width2; - } - /* handle odd height */ - if (height) { - for(w = width; w >= 2; w -= 2) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y_CCIR(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y_CCIR(r, g, b); - cb[0] = RGB_TO_U_CCIR(r1, g1, b1, 1); - cr[0] = RGB_TO_V_CCIR(r1, g1, b1, 
1); - cb++; - cr++; - p += 2 * BPP; - lum += 2; - } - if (w) { - RGB_IN(r, g, b, p); - lum[0] = RGB_TO_Y_CCIR(r, g, b); - cb[0] = RGB_TO_U_CCIR(r, g, b, 0); - cr[0] = RGB_TO_V_CCIR(r, g, b, 0); - } - } -} - -static void glue(RGB_NAME, _to_gray)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const unsigned char *p; - unsigned char *q; - int r, g, b, dst_wrap, src_wrap; - int x, y; - - p = src->data[0]; - src_wrap = src->linesize[0] - BPP * width; - - q = dst->data[0]; - dst_wrap = dst->linesize[0] - width; - - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - RGB_IN(r, g, b, p); - q[0] = RGB_TO_Y(r, g, b); - q++; - p += BPP; - } - p += src_wrap; - q += dst_wrap; - } -} - -static void glue(gray_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const unsigned char *p; - unsigned char *q; - int r, dst_wrap, src_wrap; - int x, y; - - p = src->data[0]; - src_wrap = src->linesize[0] - width; - - q = dst->data[0]; - dst_wrap = dst->linesize[0] - BPP * width; - - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - r = p[0]; - RGB_OUT(q, r, r, r); - q += BPP; - p ++; - } - p += src_wrap; - q += dst_wrap; - } -} - -static void glue(pal8_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const unsigned char *p; - unsigned char *q; - int r, g, b, dst_wrap, src_wrap; - int x, y; - uint32_t v; - const uint32_t *palette; - - p = src->data[0]; - src_wrap = src->linesize[0] - width; - palette = (uint32_t *)src->data[1]; - - q = dst->data[0]; - dst_wrap = dst->linesize[0] - BPP * width; - - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - v = palette[p[0]]; - r = (v >> 16) & 0xff; - g = (v >> 8) & 0xff; - b = (v) & 0xff; -#ifdef RGBA_OUT - { - int a; - a = (v >> 24) & 0xff; - RGBA_OUT(q, r, g, b, a); - } -#else - RGB_OUT(q, r, g, b); -#endif - q += BPP; - p ++; - } - p += src_wrap; - q += dst_wrap; - } -} - -// RGB24 has optimized routines -#if !defined(FMT_RGB32) && !defined(FMT_RGB24) -/* alpha support */ - -static void glue(rgb32_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *s; - uint8_t *d; - int src_wrap, dst_wrap, j, y; - unsigned int v, r, g, b; -#ifdef RGBA_OUT - unsigned int a; -#endif - - s = src->data[0]; - src_wrap = src->linesize[0] - width * 4; - - d = dst->data[0]; - dst_wrap = dst->linesize[0] - width * BPP; - - for(y=0;y<height;y++) { - for(j = 0;j < width; j++) { - v = ((const uint32_t *)(s))[0]; - r = (v >> 16) & 0xff; - g = (v >> 8) & 0xff; - b = v & 0xff; -#ifdef RGBA_OUT - a = (v >> 24) & 0xff; - RGBA_OUT(d, r, g, b, a); -#else - RGB_OUT(d, r, g, b); -#endif - s += 4; - d += BPP; - } - s += src_wrap; - d += dst_wrap; - } -} - -static void glue(RGB_NAME, _to_rgb32)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *s; - uint8_t *d; - int src_wrap, dst_wrap, j, y; - unsigned int r, g, b; -#ifdef RGBA_IN - unsigned int a; -#endif - - s = src->data[0]; - src_wrap = src->linesize[0] - width * BPP; - - d = dst->data[0]; - dst_wrap = dst->linesize[0] - width * 4; - - for(y=0;y<height;y++) { - for(j = 0;j < width; j++) { -#ifdef RGBA_IN - RGBA_IN(r, g, b, a, s); - ((uint32_t *)(d))[0] = (a << 24) | (r << 16) | (g << 8) | b; -#else - RGB_IN(r, g, b, s); - ((uint32_t *)(d))[0] = (0xff << 24) | (r << 16) | (g << 8) | b; -#endif - d += 4; - s += BPP; - } - s += src_wrap; - d += dst_wrap; - } -} - -#endif /* !defined(FMT_RGB32) */ - -#ifndef FMT_RGB24 - -static void glue(rgb24_to_, RGB_NAME)(AVPicture *dst, const AVPicture *src, - 
int width, int height) -{ - const uint8_t *s; - uint8_t *d; - int src_wrap, dst_wrap, j, y; - unsigned int r, g, b; - - s = src->data[0]; - src_wrap = src->linesize[0] - width * 3; - - d = dst->data[0]; - dst_wrap = dst->linesize[0] - width * BPP; - - for(y=0;y<height;y++) { - for(j = 0;j < width; j++) { - r = s[0]; - g = s[1]; - b = s[2]; - RGB_OUT(d, r, g, b); - s += 3; - d += BPP; - } - s += src_wrap; - d += dst_wrap; - } -} - -static void glue(RGB_NAME, _to_rgb24)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *s; - uint8_t *d; - int src_wrap, dst_wrap, j, y; - unsigned int r, g , b; - - s = src->data[0]; - src_wrap = src->linesize[0] - width * BPP; - - d = dst->data[0]; - dst_wrap = dst->linesize[0] - width * 3; - - for(y=0;y<height;y++) { - for(j = 0;j < width; j++) { - RGB_IN(r, g, b, s) - d[0] = r; - d[1] = g; - d[2] = b; - d += 3; - s += BPP; - } - s += src_wrap; - d += dst_wrap; - } -} - -#endif /* !FMT_RGB24 */ - -#ifdef FMT_RGB24 - -static void yuv444p_to_rgb24(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *y1_ptr, *cb_ptr, *cr_ptr; - uint8_t *d, *d1; - int w, y, cb, cr, r_add, g_add, b_add; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - unsigned int r, g, b; - - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - for(;height > 0; height --) { - d1 = d; - for(w = width; w > 0; w--) { - YUV_TO_RGB1_CCIR(cb_ptr[0], cr_ptr[0]); - - YUV_TO_RGB2_CCIR(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - d1 += BPP; - - y1_ptr++; - cb_ptr++; - cr_ptr++; - } - d += dst->linesize[0]; - y1_ptr += src->linesize[0] - width; - cb_ptr += src->linesize[1] - width; - cr_ptr += src->linesize[2] - width; - } -} - -static void yuvj444p_to_rgb24(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const uint8_t *y1_ptr, *cb_ptr, *cr_ptr; - uint8_t *d, *d1; - int w, y, cb, cr, r_add, g_add, b_add; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - unsigned int r, g, b; - - d = dst->data[0]; - y1_ptr = src->data[0]; - cb_ptr = src->data[1]; - cr_ptr = src->data[2]; - for(;height > 0; height --) { - d1 = d; - for(w = width; w > 0; w--) { - YUV_TO_RGB1(cb_ptr[0], cr_ptr[0]); - - YUV_TO_RGB2(r, g, b, y1_ptr[0]); - RGB_OUT(d1, r, g, b); - d1 += BPP; - - y1_ptr++; - cb_ptr++; - cr_ptr++; - } - d += dst->linesize[0]; - y1_ptr += src->linesize[0] - width; - cb_ptr += src->linesize[1] - width; - cr_ptr += src->linesize[2] - width; - } -} - -static void rgb24_to_yuv444p(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - int src_wrap, x, y; - int r, g, b; - uint8_t *lum, *cb, *cr; - const uint8_t *p; - - lum = dst->data[0]; - cb = dst->data[1]; - cr = dst->data[2]; - - src_wrap = src->linesize[0] - width * BPP; - p = src->data[0]; - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - RGB_IN(r, g, b, p); - lum[0] = RGB_TO_Y_CCIR(r, g, b); - cb[0] = RGB_TO_U_CCIR(r, g, b, 0); - cr[0] = RGB_TO_V_CCIR(r, g, b, 0); - p += BPP; - cb++; - cr++; - lum++; - } - p += src_wrap; - lum += dst->linesize[0] - width; - cb += dst->linesize[1] - width; - cr += dst->linesize[2] - width; - } -} - -static void rgb24_to_yuvj420p(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - int wrap, wrap3, width2; - int r, g, b, r1, g1, b1, w; - uint8_t *lum, *cb, *cr; - const uint8_t *p; - - lum = dst->data[0]; - cb = dst->data[1]; - cr = dst->data[2]; - - width2 = (width + 1) >> 1; - wrap = dst->linesize[0]; - wrap3 = src->linesize[0]; - p = src->data[0]; - for(;height>=2;height -= 2) { - for(w 
= width; w >= 2; w -= 2) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y(r, g, b); - p += wrap3; - lum += wrap; - - RGB_IN(r, g, b, p); - r1 += r; - g1 += g; - b1 += b; - lum[0] = RGB_TO_Y(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y(r, g, b); - - cb[0] = RGB_TO_U(r1, g1, b1, 2); - cr[0] = RGB_TO_V(r1, g1, b1, 2); - - cb++; - cr++; - p += -wrap3 + 2 * BPP; - lum += -wrap + 2; - } - if (w) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y(r, g, b); - p += wrap3; - lum += wrap; - RGB_IN(r, g, b, p); - r1 += r; - g1 += g; - b1 += b; - lum[0] = RGB_TO_Y(r, g, b); - cb[0] = RGB_TO_U(r1, g1, b1, 1); - cr[0] = RGB_TO_V(r1, g1, b1, 1); - cb++; - cr++; - p += -wrap3 + BPP; - lum += -wrap + 1; - } - p += wrap3 + (wrap3 - width * BPP); - lum += wrap + (wrap - width); - cb += dst->linesize[1] - width2; - cr += dst->linesize[2] - width2; - } - /* handle odd height */ - if (height) { - for(w = width; w >= 2; w -= 2) { - RGB_IN(r, g, b, p); - r1 = r; - g1 = g; - b1 = b; - lum[0] = RGB_TO_Y(r, g, b); - - RGB_IN(r, g, b, p + BPP); - r1 += r; - g1 += g; - b1 += b; - lum[1] = RGB_TO_Y(r, g, b); - cb[0] = RGB_TO_U(r1, g1, b1, 1); - cr[0] = RGB_TO_V(r1, g1, b1, 1); - cb++; - cr++; - p += 2 * BPP; - lum += 2; - } - if (w) { - RGB_IN(r, g, b, p); - lum[0] = RGB_TO_Y(r, g, b); - cb[0] = RGB_TO_U(r, g, b, 0); - cr[0] = RGB_TO_V(r, g, b, 0); - } - } -} - -static void rgb24_to_yuvj444p(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - int src_wrap, x, y; - int r, g, b; - uint8_t *lum, *cb, *cr; - const uint8_t *p; - - lum = dst->data[0]; - cb = dst->data[1]; - cr = dst->data[2]; - - src_wrap = src->linesize[0] - width * BPP; - p = src->data[0]; - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - RGB_IN(r, g, b, p); - lum[0] = RGB_TO_Y(r, g, b); - cb[0] = RGB_TO_U(r, g, b, 0); - cr[0] = RGB_TO_V(r, g, b, 0); - p += BPP; - cb++; - cr++; - lum++; - } - p += src_wrap; - lum += dst->linesize[0] - width; - cb += dst->linesize[1] - width; - cr += dst->linesize[2] - width; - } -} - -#endif /* FMT_RGB24 */ - -#if defined(FMT_RGB24) || defined(FMT_RGB32) - -static void glue(RGB_NAME, _to_pal8)(AVPicture *dst, const AVPicture *src, - int width, int height) -{ - const unsigned char *p; - unsigned char *q; - int dst_wrap, src_wrap; - int x, y, has_alpha; - unsigned int r, g, b; - - p = src->data[0]; - src_wrap = src->linesize[0] - BPP * width; - - q = dst->data[0]; - dst_wrap = dst->linesize[0] - width; - has_alpha = 0; - - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { -#ifdef RGBA_IN - { - unsigned int a; - RGBA_IN(r, g, b, a, p); - /* crude approximation for alpha ! 
*/ - if (a < 0x80) { - has_alpha = 1; - q[0] = TRANSP_INDEX; - } else { - q[0] = gif_clut_index(r, g, b); - } - } -#else - RGB_IN(r, g, b, p); - q[0] = gif_clut_index(r, g, b); -#endif - q++; - p += BPP; - } - p += src_wrap; - q += dst_wrap; - } - - build_rgb_palette(dst->data[1], has_alpha); -} - -#endif /* defined(FMT_RGB24) || defined(FMT_RGB32) */ - -#ifdef RGBA_IN - -static int glue(get_alpha_info_, RGB_NAME)(const AVPicture *src, - int width, int height) -{ - const unsigned char *p; - int src_wrap, ret, x, y; - unsigned int r, g, b, a; - - p = src->data[0]; - src_wrap = src->linesize[0] - BPP * width; - ret = 0; - for(y=0;y<height;y++) { - for(x=0;x<width;x++) { - RGBA_IN(r, g, b, a, p); - if (a == 0x00) { - ret |= FF_ALPHA_TRANSP; - } else if (a != 0xff) { - ret |= FF_ALPHA_SEMI_TRANSP; - } - p += BPP; - } - p += src_wrap; - } - return ret; -} - -#endif /* RGBA_IN */ - -#undef RGB_IN -#undef RGBA_IN -#undef RGB_OUT -#undef RGBA_OUT -#undef BPP -#undef RGB_NAME -#undef FMT_RGB24 -#undef FMT_RGB32
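
The deletion above completes the rename this changeset is about: imgconvert_template.h was never a header in the usual sense. It is a function-body template that imgconvert.c #include's once per RGB pixel format, after defining RGB_NAME, BPP and RGB_IN/RGB_OUT (plus RGBA_IN/RGBA_OUT and FMT_RGB24 or FMT_RGB32 where applicable), and it #undef's those macros itself at the end, as the last hunk shows. A self-contained sketch of the pattern follows; the macros are toy stand-ins, and the template body is written out twice inline instead of living in a separate #include'd file:

    /* Sketch: one function body, two instantiations via token pasting. */
    #include <stdint.h>
    #include <stdio.h>

    #define xglue(x, y) x ## y
    #define glue(x, y)  xglue(x, y)

    /* --- instantiation 1: packed 24-bit RGB --- */
    #define RGB_NAME rgb24
    #define BPP      3
    #define RGB_IN(r, g, b, s) { r = (s)[0]; g = (s)[1]; b = (s)[2]; }

    static void glue(RGB_NAME, _to_gray_demo)(uint8_t *dst, const uint8_t *src, int n)
    {
        int i, r, g, b;
        for (i = 0; i < n; i++) {
            RGB_IN(r, g, b, src)
            dst[i] = (uint8_t)((r * 77 + g * 150 + b * 29) >> 8); /* ~BT.601 */
            src += BPP;
        }
    }

    #undef RGB_NAME
    #undef BPP
    #undef RGB_IN

    /* --- instantiation 2: packed 32-bit RGB32, stored B,G,R,A in memory
     *     and read as a little-endian uint32_t, as the template does --- */
    #define RGB_NAME rgb32
    #define BPP      4
    #define RGB_IN(r, g, b, s) { uint32_t v = *(const uint32_t *)(s); \
            r = (v >> 16) & 0xff; g = (v >> 8) & 0xff; b = v & 0xff; }

    static void glue(RGB_NAME, _to_gray_demo)(uint8_t *dst, const uint8_t *src, int n)
    {
        int i, r, g, b;
        for (i = 0; i < n; i++) {
            RGB_IN(r, g, b, src)
            dst[i] = (uint8_t)((r * 77 + g * 150 + b * 29) >> 8);
            src += BPP;
        }
    }

    #undef RGB_NAME
    #undef BPP
    #undef RGB_IN

    int main(void)
    {
        const uint8_t px24[6] = { 255, 0, 0,   0, 255, 0 };           /* R, G */
        const uint8_t px32[8] = { 0, 0, 255, 255,  0, 255, 0, 255 };  /* R, G */
        uint8_t y[2];

        rgb24_to_gray_demo(y, px24, 2);   /* name produced by glue() */
        printf("rgb24: %d %d\n", y[0], y[1]);
        rgb32_to_gray_demo(y, px32, 2);
        printf("rgb32: %d %d\n", y[0], y[1]);
        return 0;
    }

Each expansion of glue() pastes RGB_NAME into the function name, so one body yields both rgb24_to_gray_demo and rgb32_to_gray_demo; this is the same mechanism that generates the whole glue(RGB_NAME, _to_gray) family in the diff above, and the reason the file is now named with a _template.c suffix rather than .h.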