libavcodec/i386/dsputil_h264_template_ssse3.c

   1 /*
   2  * Copyright (c) 2008 Loren Merritt
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /**
  22  * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
  23  * H264_CHROMA_MC8_TMPL must be defined to the desired function name
  24  * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  25  * AVG_OP must be defined to empty for put and the identity for avg
      *
      * Writes an 8 pixel wide block of h rows to dst, interpolated from src
      * with 1/8-pel weights. Three paths are taken:
      *  - x==0 && y==0: no filtering, the work is delegated to
      *    H264_CHROMA_MC8_MV0(dst, src, stride, h);
      *  - exactly one of x,y is zero: 2-tap filter along the non-zero axis,
      *    out = ((8-k)*a + k*b + round) >> 3 with k = x+y;
      *  - otherwise: 4-tap bilinear filter,
      *    out = ((8-x)(8-y)*A + x(8-y)*B + (8-x)y*C + xy*D + round) >> 6.
      * With AVG_OP active, the filtered bytes are combined with the bytes
      * already in dst using pavgb before being stored.
      *
      * @param dst    destination block (annotated "align 8" in the signature)
      * @param src    source block, may be unaligned; the horizontal and
      *               bilinear paths read 9 bytes per row, the vertical and
      *               bilinear paths read one row past the last output row
      * @param stride byte distance between rows, applied to both dst and src
      * @param h      number of rows; every loop produces two rows per
      *               iteration and stops once h-2 <= 0 (presumably callers
      *               pass an even, positive h -- TODO confirm)
      * @param x      horizontal fraction, asserted to be in 0..7
      * @param y      vertical fraction, asserted to be in 0..7
      * @param rnd    non-zero selects ff_pw_4 / ff_pw_32 as rounding constant,
      *               zero selects ff_pw_3 / ff_pw_28
      *
      * NOTE(review): none of the asm statements below has a clobber list.
      * They store through dst without a "memory" clobber, and xmm registers
      * set up in one asm statement are consumed by the next one, which
      * relies on the compiler not touching xmm state in between -- confirm
      * this is acceptable for the targeted compilers.
  26  */
  27 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
  28 {
  29     if(y==0 && x==0) {
  30         /* no filter needed */
  31         H264_CHROMA_MC8_MV0(dst, src, stride, h);
  32         return;
  33     }
  34
  35     assert(x<8 && y<8 && x>=0 && y>=0);
  36
  37     if(y==0 || x==0)
  38     {
  39         /* 1 dimensional filter only */
             /* Setup: 255*k+8 == 256*k + (8-k) with k = x+y, so the low 16 bits
              * hold the byte pair (8-k, k). pshuflw/movlhps replicate that word
              * into all 8 words of xmm7 (the pmaddubsw weights).
              * xmm6 receives 8 bytes of the rounding constant (ff_pw_4 or
              * ff_pw_3), duplicated into the upper half by movlhps. */
  40         asm volatile(
  41             "movd %0, %%xmm7 \n\t"
  42             "movq %1, %%xmm6 \n\t"
  43             "pshuflw $0, %%xmm7, %%xmm7 \n\t"
  44             "movlhps %%xmm6, %%xmm6 \n\t"
  45             "movlhps %%xmm7, %%xmm7 \n\t"
  46             :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
  47         );
  48
  49         if(x) {
                 /* Horizontal filter (y == 0): two rows per iteration.
                  * Each row is loaded twice, at offset 0 and offset 1, and
                  * interleaved so that every byte pair is (p[i], p[i+1]). */
  50             asm volatile(
  51                 "1: \n\t"
  52                 "movq (%1), %%xmm0 \n\t"
  53                 "movq 1(%1), %%xmm1 \n\t"
  54                 "movq (%1,%3), %%xmm2 \n\t"
  55                 "movq 1(%1,%3), %%xmm3 \n\t"
  56                 "punpcklbw %%xmm1, %%xmm0 \n\t"
  57                 "punpcklbw %%xmm3, %%xmm2 \n\t"
                     /* pixel pairs * (8-k, k), summed per pair into 16-bit words */
  58                 "pmaddubsw %%xmm7, %%xmm0 \n\t"
  59                 "pmaddubsw %%xmm7, %%xmm2 \n\t"
                     /* avg variant only: fetch the two existing dst rows into xmm4 */
  60          AVG_OP("movq (%0), %%xmm4 \n\t")
  61          AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                     /* add rounding constant, divide by 8 */
  62                 "paddw %%xmm6, %%xmm0 \n\t"
  63                 "paddw %%xmm6, %%xmm2 \n\t"
  64                 "psrlw $3, %%xmm0 \n\t"
  65                 "psrlw $3, %%xmm2 \n\t"
                     /* pack to bytes: first row in the low half, second row in the high half */
  66                 "packuswb %%xmm2, %%xmm0 \n\t"
  67          AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
  68                 "movq %%xmm0, (%0) \n\t"
  69                 "movhps %%xmm0, (%0,%3) \n\t"
                     /* sub sets the flags tested by jg; the lea instructions in
                      * between advance src/dst by two rows without altering flags */
  70                 "sub $2, %2 \n\t"
  71                 "lea (%1,%3,2), %1 \n\t"
  72                 "lea (%0,%3,2), %0 \n\t"
  73                 "jg 1b \n\t"
  74                 :"+r"(dst), "+r"(src), "+r"(h)
  75                 :"r"((x86_reg)stride)
  76             );
  77         } else {
                 /* Vertical filter (x == 0): two rows per iteration.
                  * Rows r, r+1 and r+2 are loaded; the pairs (r, r+1) and
                  * (r+1, r+2) are interleaved byte-wise so each byte pair is
                  * (upper pixel, lower pixel). Row r+2 is read even for the
                  * last iteration. */
  78             asm volatile(
  79                 "1: \n\t"
  80                 "movq (%1), %%xmm0 \n\t"
  81                 "movq (%1,%3), %%xmm1 \n\t"
  82                 "movdqa %%xmm1, %%xmm2 \n\t"
  83                 "movq (%1,%3,2), %%xmm3 \n\t"
  84                 "punpcklbw %%xmm1, %%xmm0 \n\t"
  85                 "punpcklbw %%xmm3, %%xmm2 \n\t"
                     /* pixel pairs * (8-k, k), summed per pair into 16-bit words */
  86                 "pmaddubsw %%xmm7, %%xmm0 \n\t"
  87                 "pmaddubsw %%xmm7, %%xmm2 \n\t"
                     /* avg variant only: fetch the two existing dst rows into xmm4 */
  88          AVG_OP("movq (%0), %%xmm4 \n\t")
  89          AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                     /* add rounding constant, divide by 8 */
  90                 "paddw %%xmm6, %%xmm0 \n\t"
  91                 "paddw %%xmm6, %%xmm2 \n\t"
  92                 "psrlw $3, %%xmm0 \n\t"
  93                 "psrlw $3, %%xmm2 \n\t"
                     /* pack to bytes: first row in the low half, second row in the high half */
  94                 "packuswb %%xmm2, %%xmm0 \n\t"
  95          AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
  96                 "movq %%xmm0, (%0) \n\t"
  97                 "movhps %%xmm0, (%0,%3) \n\t"
                     /* sub sets the flags tested by jg; lea leaves them untouched */
  98                 "sub $2, %2 \n\t"
  99                 "lea (%1,%3,2), %1 \n\t"
 100                 "lea (%0,%3,2), %0 \n\t"
 101                 "jg 1b \n\t"
 102                 :"+r"(dst), "+r"(src), "+r"(h)
 103                 :"r"((x86_reg)stride)
 104             );
 105         }
 106         return;
 107     }
 108
 109     /* general case, bilinear */
         /* Setup: x*255+8 == 256*x + (8-x). Multiplying by (8-y) resp. y gives
          * 16-bit words whose bytes are the weight pairs
          *   xmm7: ((8-x)(8-y), x(8-y))   applied to the upper row
          *   xmm6: ((8-x)y,     xy)       applied to the lower row
          * (x,y are in 1..7 here, so every weight fits in a byte without
          * carrying). xmm5 receives the rounding constant via movdqa, which
          * needs a 16-byte aligned operand -- assumes ff_pw_32 and ff_pw_28
          * are declared with that alignment, TODO confirm. */
 110     asm volatile(
 111         "movd %0, %%xmm7 \n\t"
 112         "movd %1, %%xmm6 \n\t"
 113         "movdqa %2, %%xmm5 \n\t"
 114         "pshuflw $0, %%xmm7, %%xmm7 \n\t"
 115         "pshuflw $0, %%xmm6, %%xmm6 \n\t"
 116         "movlhps %%xmm7, %%xmm7 \n\t"
 117         "movlhps %%xmm6, %%xmm6 \n\t"
 118         :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
 119     );
 120
         /* Main loop: xmm0 always holds the interleaved (p[i], p[i+1]) pairs of
          * the row above the next output row. It is primed with row 0 before
          * the loop and refreshed at the end of each iteration from the copy
          * of the last loaded row kept in xmm4. Two output rows are produced
          * per iteration from three source rows. */
 121     asm volatile(
 122         "movq (%1), %%xmm0 \n\t"
 123         "movq 1(%1), %%xmm1 \n\t"
 124         "punpcklbw %%xmm1, %%xmm0 \n\t"
 125         "add %3, %1 \n\t"
 126         "1: \n\t"
             /* load and interleave the next two source rows */
 127         "movq (%1), %%xmm1 \n\t"
 128         "movq 1(%1), %%xmm2 \n\t"
 129         "movq (%1,%3), %%xmm3 \n\t"
 130         "movq 1(%1,%3), %%xmm4 \n\t"
 131         "lea (%1,%3,2), %1 \n\t"
 132         "punpcklbw %%xmm2, %%xmm1 \n\t"
 133         "punpcklbw %%xmm4, %%xmm3 \n\t"
             /* keep unmultiplied copies: xmm2 serves as upper row of the second
              * output row, xmm4 becomes xmm0 for the next iteration */
 134         "movdqa %%xmm1, %%xmm2 \n\t"
 135         "movdqa %%xmm3, %%xmm4 \n\t"
             /* upper rows * xmm7 weights, lower rows * xmm6 weights */
 136         "pmaddubsw %%xmm7, %%xmm0 \n\t"
 137         "pmaddubsw %%xmm6, %%xmm1 \n\t"
 138         "pmaddubsw %%xmm7, %%xmm2 \n\t"
 139         "pmaddubsw %%xmm6, %%xmm3 \n\t"
             /* rounding + upper + lower: xmm1 = first output row, xmm3 = second */
 140         "paddw %%xmm5, %%xmm0 \n\t"
 141         "paddw %%xmm5, %%xmm2 \n\t"
 142         "paddw %%xmm0, %%xmm1 \n\t"
 143         "paddw %%xmm2, %%xmm3 \n\t"
 144         "movdqa %%xmm4, %%xmm0 \n\t"
             /* divide by 64 */
 145         "psrlw $6, %%xmm1 \n\t"
 146         "psrlw $6, %%xmm3 \n\t"
             /* avg variant only: fetch the two existing dst rows into xmm2 */
 147  AVG_OP("movq (%0), %%xmm2 \n\t")
 148  AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
 149         "packuswb %%xmm3, %%xmm1 \n\t"
 150  AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
 151         "movq %%xmm1, (%0)\n\t"
 152         "movhps %%xmm1, (%0,%3)\n\t"
             /* sub sets the flags tested by jg; lea leaves them untouched */
 153         "sub $2, %2 \n\t"
 154         "lea (%0,%3,2), %0 \n\t"
 155         "jg 1b \n\t"
 156         :"+r"(dst), "+r"(src), "+r"(h)
 157         :"r"((x86_reg)stride)
 158     );
 159 }
 160
 161 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 162 {
         /*
          * 4 pixel wide counterpart of H264_CHROMA_MC8_TMPL, working on MMX
          * registers (mm0-mm7) with the SSSE3 pmaddubsw instruction.
          *
          * Always runs the 4-tap bilinear filter
          *   out = ((8-x)(8-y)*A + x(8-y)*B + (8-x)y*C + xy*D + round) >> 6
          * with ff_pw_32 as the rounding constant; there is no rnd parameter
          * and no shortcut for x==0 or y==0 (the corresponding weights simply
          * become zero). Two rows are produced per iteration and one source
          * row past the last output row is read. With AVG_OP active the
          * result is combined with the existing dst bytes using pavgb.
          *
          * dst:    destination block (annotated "align 4" in the signature)
          * src:    source block, may be unaligned
          * stride: byte distance between rows, applied to both dst and src
          * h:      number of rows; the loop stops once h-2 <= 0 (presumably
          *         callers pass an even, positive h -- TODO confirm)
          * x, y:   weights in eighths; not range-checked here, unlike the
          *         8 pixel version -- presumably 0..7, TODO confirm
          *
          * NOTE(review): MMX registers are used and no emms is executed in
          * this function; presumably the caller is responsible for that --
          * confirm. As in the 8 pixel version, the asm statements have no
          * clobber list (no "memory" clobber despite the stores through dst)
          * and mm5-mm7 are carried from the first asm statement into the
          * second one.
          */
         /* Setup: x*255+8 == 256*x + (8-x); multiplied by (8-y) resp. y this
          * yields words whose bytes are the weight pairs
          *   mm7: ((8-x)(8-y), x(8-y))   applied to the upper row
          *   mm6: ((8-x)y,     xy)       applied to the lower row
          * pshufw $0 replicates the low word into all four words.
          * mm5 holds the rounding constant loaded from ff_pw_32. */
 163     asm volatile(
 164         "movd %0, %%mm7 \n\t"
 165         "movd %1, %%mm6 \n\t"
 166         "movq %2, %%mm5 \n\t"
 167         "pshufw $0, %%mm7, %%mm7 \n\t"
 168         "pshufw $0, %%mm6, %%mm6 \n\t"
 169         :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
 170     );
 171
         /* Main loop: mm0 always holds the interleaved (p[i], p[i+1]) pairs of
          * the row above the next output row; it is primed with row 0 and
          * refreshed each iteration from the copy kept in mm4. */
 172     asm volatile(
 173         "movd (%1), %%mm0 \n\t"
 174         "punpcklbw 1(%1), %%mm0 \n\t"
 175         "add %3, %1 \n\t"
 176         "1: \n\t"
             /* load the next two source rows, interleaved with their right neighbours */
 177         "movd (%1), %%mm1 \n\t"
 178         "movd (%1,%3), %%mm3 \n\t"
 179         "punpcklbw 1(%1), %%mm1 \n\t"
 180         "punpcklbw 1(%1,%3), %%mm3 \n\t"
 181         "lea (%1,%3,2), %1 \n\t"
             /* keep unmultiplied copies: mm2 serves as upper row of the second
              * output row, mm4 becomes mm0 for the next iteration */
 182         "movq %%mm1, %%mm2 \n\t"
 183         "movq %%mm3, %%mm4 \n\t"
             /* upper rows * mm7 weights, lower rows * mm6 weights */
 184         "pmaddubsw %%mm7, %%mm0 \n\t"
 185         "pmaddubsw %%mm6, %%mm1 \n\t"
 186         "pmaddubsw %%mm7, %%mm2 \n\t"
 187         "pmaddubsw %%mm6, %%mm3 \n\t"
             /* rounding + upper + lower: mm1 = first output row, mm3 = second */
 188         "paddw %%mm5, %%mm0 \n\t"
 189         "paddw %%mm5, %%mm2 \n\t"
 190         "paddw %%mm0, %%mm1 \n\t"
 191         "paddw %%mm2, %%mm3 \n\t"
 192         "movq %%mm4, %%mm0 \n\t"
             /* divide by 64 and pack each row to bytes */
 193         "psrlw $6, %%mm1 \n\t"
 194         "psrlw $6, %%mm3 \n\t"
 195         "packuswb %%mm1, %%mm1 \n\t"
 196         "packuswb %%mm3, %%mm3 \n\t"
             /* avg variant only: average with the existing dst rows */
 197  AVG_OP("pavgb (%0), %%mm1 \n\t")
 198  AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
             /* store 4 bytes per row */
 199         "movd %%mm1, (%0)\n\t"
 200         "movd %%mm3, (%0,%3)\n\t"
             /* sub sets the flags tested by jg; lea leaves them untouched */
 201         "sub $2, %2 \n\t"
 202         "lea (%0,%3,2), %0 \n\t"
 203         "jg 1b \n\t"
 204         :"+r"(dst), "+r"(src), "+r"(h)
 205         :"r"((x86_reg)stride)
 206     );
 207 }
 208