libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53 /* File-local VLC objects, each followed by a static VLC_TYPE array and a constant repeating that array's size. NOTE(review): presumably each array is the preallocated table storage for the VLC of the same name - confirm at the initialisation site, which is outside this chunk. */
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57 /* the four sizes above sum to the first dimension of coeff_token_vlc_tables */
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61 /* 15 tables of 512 entries each */
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  64 static const int total_zeros_vlc_tables_size = 512;
  65 /* 3 tables of 8 entries each */
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69 /* 6 tables of 8 entries each */
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2];
  72 static const int run_vlc_tables_size = 8;
  73 /* a single table of 96 entries */
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77 /* Forward declarations of file-local (static) functions; their definitions are not in the visible part of the file. */
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #if HAVE_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91 /* rem6[q] == q % 6 for q = 0..51 */
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95 /* div6[q] == q / 6 for q = 0..51 */
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99 /* Index tables used by fill_caches() as left_block: row 0 is the default; row 3 (current macroblock is a field macroblock) or row 2 - bottom (otherwise) is selected when the left pair's field flag differs from the current one. Entries 0..3 index the left neighbour's intra4x4_pred_mode / non_zero_count entries and motion-vector rows, entries 4..7 index further non_zero_count entries. */
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106 /* cavlc_level_tab holds 7 sub-tables of 1<<LEVEL_TAB_BITS entries, two int8_t per entry; it is not referenced in the visible part of the file. */
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109 /** Fills the neighbour-dependent caches in h for the macroblock at h->mb_xy of type mb_type: intra sample availability masks, intra4x4 mode cache, non-zero-count cache, top/left CBP, mv/ref/mvd/direct caches and neighbor_transform_size. A non-zero for_deblock selects the variant that skips the intra availability setup and everything past the top and left motion data. */
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118     /* top neighbour: (mb_stride << FIELD_PICTURE) entries before the current macroblock */
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127     /* default neighbour addresses; left_xy[0] and left_xy[1] supply cache rows 1-2 and 3-4 of the left column */
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144         /* depending on the field flags of the current and neighbouring pairs, move the neighbour addresses by one macroblock row */
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168     /* store the resolved top and left addresses in the context */
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178         /* above: a neighbour whose slice_table entry is >= 0xFFFF gets type 0; the top-left and top-right types are set to 0 */
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values were changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201     /* above: a neighbour keeps its type only if its slice_table entry equals h->slice_num */
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208         /* top macroblock not usable under type_mask: drop the top-dependent bits from the masks */
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245         /* top row and left column of the intra4x4 mode cache: the neighbour's stored modes if it is intra4x4, else -1 (not usable under type_mask) or 2 */
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318     /* left column of the non-zero-count cache, one iteration per left_xy[] entry; left_block selects the source entries */
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332     /* with CABAC: coded block patterns of the top and left neighbours, with intra-dependent defaults when a neighbour has type 0 */
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357     /* motion vectors and reference indices of the neighbours, per reference list */
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371             /* top row: four motion vectors and two reference indices taken from the top macroblock (block row 3, 8x8 row 1) */
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390             /* left column: two motion vectors and reference indices per left_xy[] entry */
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407             /* the deblocking variant, and direct macroblocks without spatial prediction outside MBAFF, stop after top and left */
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433             /* the cache positions listed below are marked PART_NOT_AVAILABLE and given a zero vector */
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444             /* with CABAC: mvd of the top and left neighbours and, in B slices, their direct flags */
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509             /* MBAFF: rescale the ref index and vertical mv/mvd of neighbours whose field/frame type differs from the current macroblock */
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545     /* counts how many of the top and left[0] neighbours are flagged IS_8x8DCT (0..2) */
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548 /** Copies the right column and bottom row of intra4x4_pred_mode_cache into intra4x4_pred_mode[h->mb_xy], the per-macroblock array fill_caches() reads neighbour modes from. */
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int xy= h->mb_xy;
 551     int k;
 552
 553     /* cache column 7, rows 1..4 -> entries 0..3 */
 554     for(k=0; k<4; k++)
 555         h->intra4x4_pred_mode[xy][k]= h->intra4x4_pred_mode_cache[7+8*(k+1)];
 556     /* cache row 4, columns 4..6 -> entries 4..6 */
 557     for(k=0; k<3; k++)
 558         h->intra4x4_pred_mode[xy][4+k]= h->intra4x4_pred_mode_cache[4+k+8*4];
 559 }
 560
 561 /**
 562  * Checks the intra4x4 modes in the first cache row / column of the current macroblock against the available top / left samples. Each mode is looked up in a table: 0 keeps it, a positive entry replaces it (LEFT_DC_PRED, TOP_DC_PRED or DC_128_PRED), a negative entry means the mode needs a missing neighbour. Returns 0 on success, -1 after logging the offending mode and macroblock position.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){ // report the requested mode; status itself is always negative here
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", h->intra4x4_pred_mode_cache[scan8[0] + i], s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20}; // availability bit tested for cache rows 0..3
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){ // report the requested mode; status itself is always negative here
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", h->intra4x4_pred_mode_cache[scan8[0] + 8*i], s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * Validates intra prediction mode `mode` (accepted range 0..6) against the available top and left samples, remapping it through a table when a side is missing. Returns the resulting mode, or -1 after logging an error if the mode is out of range or needs an unavailable neighbour.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     static const int8_t without_top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 605     static const int8_t without_left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 606     const int left_avail= h->left_samples_available&0x8080; /* the two left-availability bits tested below */
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", h->s.mb_x, h->s.mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= without_top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", h->s.mb_x, h->s.mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if(left_avail != 0x8080){
 622         mode= without_left[ mode ];
 623         if(left_avail){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + !(left_avail&0x8000) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", h->s.mb_x, h->s.mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1366     int i, si, di;
1367     uint8_t *dst;
1368     int bufidx;
1369
1370 //    src[0]&0x80;                //forbidden bit
1371     h->nal_ref_idc= src[0]>>5;
1372     h->nal_unit_type= src[0]&0x1F;
1373
1374     src++; length--;
1375 #if 0
1376     for(i=0; i<length; i++)
1377         printf("%2X ", src[i]);
1378 #endif
1379
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1382 #   define RS 7
1383     for(i=0; i+1<length; i+=9){
1384         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1385 # else
1386 #   define RS 3
1387     for(i=0; i+1<length; i+=5){
1388         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1389 # endif
1390             continue;
1391         if(i>0 && !src[i]) i--;
1392         while(src[i]) i++;
1393 #else
1394 #   define RS 0
1395     for(i=0; i+1<length; i+=2){
1396         if(src[i]) continue;
1397         if(i>0 && src[i-1]==0) i--;
1398 #endif
1399         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1400             if(src[i+2]!=3){
1401                 /* startcode, so we must be past the end */
1402                 length=i;
1403             }
1404             break;
1405         }
1406         i-= RS;
1407     }
1408
1409     if(i>=length-1){ //no escaped 0
1410         *dst_length= length;
1411         *consumed= length+1; //+1 for the header
1412         return src;
1413     }
1414
1415     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416     av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417     dst= h->rbsp_buffer[bufidx];
1418
1419     if (dst == NULL){
1420         return NULL;
1421     }
1422
1423 //printf("decoding esc\n");
1424     memcpy(dst, src, i);
1425     si=di=i;
1426     while(si+2<length){
1427         //remove escapes (very rare 1:2^22)
1428         if(src[si+2]>3){
1429             dst[di++]= src[si++];
1430             dst[di++]= src[si++];
1431         }else if(src[si]==0 && src[si+1]==0){
1432             if(src[si+2]==3){ //escape
1433                 dst[di++]= 0;
1434                 dst[di++]= 0;
1435                 si+=3;
1436                 continue;
1437             }else //next start code
1438                 goto nsc;
1439         }
1440
1441         dst[di++]= src[si++];
1442     }
1443     while(si<length)
1444         dst[di++]= src[si++];
1445 nsc:
1446
1447     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1448
1449     *dst_length= di;
1450     *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1452     return dst;
1453 }
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it depends on the stride macro defined inside
 * h264_luma_dc_dequant_idct_c().
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];
        int *out= temp + 4*i;

        out[0]= sum04  + sum15;
        out[1]= diff04 + diff15;
        out[2]= diff04 - diff15;
        out[3]= sum04  - sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1547
1548 #undef xStride
1549 #undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + stride;
    const int row0_sum = row0[0] + row0[xStride];
    const int row0_diff= row0[0] - row0[xStride];
    const int row1_sum = row1[0] + row1[xStride];
    const int row1_diff= row1[0] - row1[xStride];

    row0[0      ]= row0_sum  + row1_sum;
    row0[xStride]= row0_diff + row1_diff;
    row1[0      ]= row0_sum  - row1_sum;
    row1[xStride]= row0_diff - row1_diff;
}
#endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604                            int src_x_offset, int src_y_offset,
1605                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606     MpegEncContext * const s = &h->s;
1607     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609     const int luma_xy= (mx&3) + ((my&3)<<2);
1610     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611     uint8_t * src_cb, * src_cr;
1612     int extra_width= h->emu_edge_width;
1613     int extra_height= h->emu_edge_height;
1614     int emu=0;
1615     const int full_mx= mx>>2;
1616     const int full_my= my>>2;
1617     const int pic_width  = 16*s->mb_width;
1618     const int pic_height = 16*s->mb_height >> MB_FIELD;
1619
1620     if(mx&7) extra_width -= 3;
1621     if(my&7) extra_height -= 3;
1622
1623     if(   full_mx < 0-extra_width
1624        || full_my < 0-extra_height
1625        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1629         emu=1;
1630     }
1631
1632     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1633     if(!square){
1634         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1635     }
1636
1637     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1638
1639     if(MB_FIELD){
1640         // chroma offset when predicting from a field of opposite parity
1641         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1643     }
1644     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1646
1647     if(emu){
1648         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649             src_cb= s->edge_emu_buffer;
1650     }
1651     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1652
1653     if(emu){
1654         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655             src_cr= s->edge_emu_buffer;
1656     }
1657     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1658 }
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799     MpegEncContext * const s = &h->s;
1800     const int mb_xy= h->mb_xy;
1801     const int mb_type= s->current_picture.mb_type[mb_xy];
1802
1803     assert(IS_INTER(mb_type));
1804
1805     prefetch_motion(h, 0);
1806
1807     if(IS_16X16(mb_type)){
1808         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810                 &weight_op[0], &weight_avg[0],
1811                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812     }else if(IS_16X8(mb_type)){
1813         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815                 &weight_op[1], &weight_avg[1],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819                 &weight_op[1], &weight_avg[1],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else if(IS_8X16(mb_type)){
1822         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824                 &weight_op[2], &weight_avg[2],
1825                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828                 &weight_op[2], &weight_avg[2],
1829                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830     }else{
1831         int i;
1832
1833         assert(IS_8X8(mb_type));
1834
1835         for(i=0; i<4; i++){
1836             const int sub_mb_type= h->sub_mb_type[i];
1837             const int n= 4*i;
1838             int x_offset= (i&1)<<2;
1839             int y_offset= (i&2)<<1;
1840
1841             if(IS_SUB_8X8(sub_mb_type)){
1842                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844                     &weight_op[3], &weight_avg[3],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_8X4(sub_mb_type)){
1847                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849                     &weight_op[4], &weight_avg[4],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853                     &weight_op[4], &weight_avg[4],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else if(IS_SUB_4X8(sub_mb_type)){
1856                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858                     &weight_op[5], &weight_avg[5],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                     &weight_op[5], &weight_avg[5],
1863                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864             }else{
1865                 int j;
1866                 assert(IS_SUB_4X4(sub_mb_type));
1867                 for(j=0; j<4; j++){
1868                     int sub_x_offset= x_offset + 2*(j&1);
1869                     int sub_y_offset= y_offset +   (j&2);
1870                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872                         &weight_op[6], &weight_avg[6],
1873                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1874                 }
1875             }
1876         }
1877     }
1878
1879     prefetch_motion(h, 1);
1880 }
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000         av_freep(&hx->rbsp_buffer[1]);
2001         av_freep(&hx->rbsp_buffer[0]);
2002         if (i) av_freep(&h->thread_context[i]);
2003     }
2004 }
2005
2006 static void init_dequant8_coeff_table(H264Context *h){
2007     int i,q,x;
2008     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2009     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2010     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2011
2012     for(i=0; i<2; i++ ){
2013         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2014             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2015             break;
2016         }
2017
2018         for(q=0; q<52; q++){
2019             int shift = div6[q];
2020             int idx = rem6[q];
2021             for(x=0; x<64; x++)
2022                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2023                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2024                     h->pps.scaling_matrix8[i][x]) << shift;
2025         }
2026     }
2027 }
2028
2029 static void init_dequant4_coeff_table(H264Context *h){
2030     int i,j,q,x;
2031     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2032     for(i=0; i<6; i++ ){
2033         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2034         for(j=0; j<i; j++){
2035             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2036                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2037                 break;
2038             }
2039         }
2040         if(j<i)
2041             continue;
2042
2043         for(q=0; q<52; q++){
2044             int shift = div6[q] + 2;
2045             int idx = rem6[q];
2046             for(x=0; x<16; x++)
2047                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2048                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2049                     h->pps.scaling_matrix4[i][x]) << shift;
2050         }
2051     }
2052 }
2053
2054 static void init_dequant_tables(H264Context *h){
2055     int i,x;
2056     init_dequant4_coeff_table(h);
2057     if(h->pps.transform_8x8_mode)
2058         init_dequant8_coeff_table(h);
2059     if(h->sps.transform_bypass){
2060         for(i=0; i<6; i++)
2061             for(x=0; x<16; x++)
2062                 h->dequant4_coeff[i][0][x] = 1<<6;
2063         if(h->pps.transform_8x8_mode)
2064             for(i=0; i<2; i++)
2065                 for(x=0; x<64; x++)
2066                     h->dequant8_coeff[i][0][x] = 1<<6;
2067     }
2068 }
2069
2070
2071 /**
2072  * allocates tables.
2073  * needs width/height
2074  */
2075 static int alloc_tables(H264Context *h){
2076     MpegEncContext * const s = &h->s;
2077     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2078     int x,y;
2079
2080     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t), fail)
2081
2082     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t), fail)
2083     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
2084     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
2085
2086     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
2087     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
2088     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
2089     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
2090
2091     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2092     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2093
2094     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy  , big_mb_num * sizeof(uint32_t), fail);
2095     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
2096     for(y=0; y<s->mb_height; y++){
2097         for(x=0; x<s->mb_width; x++){
2098             const int mb_xy= x + y*s->mb_stride;
2099             const int b_xy = 4*x + 4*y*h->b_stride;
2100             const int b8_xy= 2*x + 2*y*h->b8_stride;
2101
2102             h->mb2b_xy [mb_xy]= b_xy;
2103             h->mb2b8_xy[mb_xy]= b8_xy;
2104         }
2105     }
2106
2107     s->obmc_scratchpad = NULL;
2108
2109     if(!h->dequant4_coeff[0])
2110         init_dequant_tables(h);
2111
2112     return 0;
2113 fail:
2114     free_tables(h);
2115     return -1;
2116 }
2117
2118 /**
2119  * Mimic alloc_tables(), but for every context thread.
2120  */
2121 static void clone_tables(H264Context *dst, H264Context *src){
2122     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2123     dst->non_zero_count           = src->non_zero_count;
2124     dst->slice_table              = src->slice_table;
2125     dst->cbp_table                = src->cbp_table;
2126     dst->mb2b_xy                  = src->mb2b_xy;
2127     dst->mb2b8_xy                 = src->mb2b8_xy;
2128     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2129     dst->mvd_table[0]             = src->mvd_table[0];
2130     dst->mvd_table[1]             = src->mvd_table[1];
2131     dst->direct_table             = src->direct_table;
2132
2133     dst->s.obmc_scratchpad = NULL;
2134     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2135 }
2136
2137 /**
2138  * Init context
2139  * Allocate buffers which are not shared amongst multiple threads.
2140  */
2141 static int context_init(H264Context *h){
2142     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2143     FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
2144
2145     return 0;
2146 fail:
2147     return -1; // free_tables will clean up for us
2148 }
2149
2150 static av_cold void common_init(H264Context *h){
2151     MpegEncContext * const s = &h->s;
2152
2153     s->width = s->avctx->width;
2154     s->height = s->avctx->height;
2155     s->codec_id= s->avctx->codec->id;
2156
2157     ff_h264_pred_init(&h->hpc, s->codec_id);
2158
2159     h->dequant_coeff_pps= -1;
2160     s->unrestricted_mv=1;
2161     s->decode=1; //FIXME
2162
2163     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2164
2165     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2166     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2167 }
2168
2169 /**
2170  * Reset SEI values at the beginning of the frame.
2171  *
2172  * @param h H.264 context.
2173  */
2174 static void reset_sei(H264Context *h) {
2175     h->sei_recovery_frame_cnt       = -1;
2176     h->sei_dpb_output_delay         =  0;
2177     h->sei_cpb_removal_delay        = -1;
2178     h->sei_buffering_period_present =  0;
2179 }
2180
2181 static av_cold int decode_init(AVCodecContext *avctx){
2182     H264Context *h= avctx->priv_data;
2183     MpegEncContext * const s = &h->s;
2184
2185     MPV_decode_defaults(s);
2186
2187     s->avctx = avctx;
2188     common_init(h);
2189
2190     s->out_format = FMT_H264;
2191     s->workaround_bugs= avctx->workaround_bugs;
2192
2193     // set defaults
2194 //    s->decode_mb= ff_h263_decode_mb;
2195     s->quarter_sample = 1;
2196     if(!avctx->has_b_frames)
2197     s->low_delay= 1;
2198
2199     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2200         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2201     else
2202         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2203     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2204     avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
2205
2206     decode_init_vlc();
2207
2208     if(avctx->extradata_size > 0 && avctx->extradata &&
2209        *(char *)avctx->extradata == 1){
2210         h->is_avc = 1;
2211         h->got_avcC = 0;
2212     } else {
2213         h->is_avc = 0;
2214     }
2215
2216     h->thread_context[0] = h;
2217     h->outputed_poc = INT_MIN;
2218     h->prev_poc_msb= 1<<16;
2219     reset_sei(h);
2220     if(avctx->codec_id == CODEC_ID_H264){
2221         if(avctx->ticks_per_frame == 1){
2222             s->avctx->time_base.den *=2;
2223         }
2224         avctx->ticks_per_frame = 2;
2225     }
2226     return 0;
2227 }
2228
2229 static int frame_start(H264Context *h){
2230     MpegEncContext * const s = &h->s;
2231     int i;
2232
2233     if(MPV_frame_start(s, s->avctx) < 0)
2234         return -1;
2235     ff_er_frame_start(s);
2236     /*
2237      * MPV_frame_start uses pict_type to derive key_frame.
2238      * This is incorrect for H.264; IDR markings must be used.
2239      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2240      * See decode_nal_units().
2241      */
2242     s->current_picture_ptr->key_frame= 0;
2243     s->current_picture_ptr->mmco_reset= 0;
2244
2245     assert(s->linesize && s->uvlinesize);
2246
2247     for(i=0; i<16; i++){
2248         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2249         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2250     }
2251     for(i=0; i<4; i++){
2252         h->block_offset[16+i]=
2253         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2254         h->block_offset[24+16+i]=
2255         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2256     }
2257
2258     /* can't be in alloc_tables because linesize isn't known there.
2259      * FIXME: redo bipred weight to not require extra buffer? */
2260     for(i = 0; i < s->avctx->thread_count; i++)
2261         if(!h->thread_context[i]->s.obmc_scratchpad)
2262             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2263
2264     /* some macroblocks will be accessed before they're available */
2265     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2266         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2267
2268 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2269
2270     // We mark the current picture as non-reference after allocating it, so
2271     // that if we break out due to an error it can be released automatically
2272     // in the next MPV_frame_start().
2273     // SVQ3 as well as most other codecs have only last/next/current and thus
2274     // get released even with set reference, besides SVQ3 and others do not
2275     // mark frames as reference later "naturally".
2276     if(s->codec_id != CODEC_ID_SVQ3)
2277         s->current_picture_ptr->reference= 0;
2278
2279     s->current_picture_ptr->field_poc[0]=
2280     s->current_picture_ptr->field_poc[1]= INT_MAX;
2281     assert(s->current_picture_ptr->long_ref==0);
2282
2283     return 0;
2284 }
2285
/**
 * Save the border samples of the macroblock at s->mb_x that was just
 * reconstructed: its right-most column goes to h->left_border and its
 * bottom row to h->top_borders[top_idx][s->mb_x].
 *
 * Layout used below: in a top_borders entry bytes 0..15 are luma, 16..23
 * Cb and 24..31 Cr; in h->left_border luma starts at `offset`, Cb at
 * `uvoffset+34` and Cr at `uvoffset+34+18`.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line sizes to step through the planes with
 * @param simple nonzero to skip the FRAME_MBAFF handling and the
 *               CODEC_FLAG_GRAY check
 *
 * NOTE(review): the 8-byte copies go through uint64_t casts of uint8_t
 * pointers; this presumably relies on the buffers being suitably aligned
 * -- confirm for the target platforms.
 */
2286 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2287     MpegEncContext * const s = &h->s;
2288     int i;
2289     int step    = 1;
2290     int offset  = 1;
2291     int uvoffset= 1;
2292     int top_idx = 1;
2293     int skiplast= 0;
2294
     /* step back one line, so that row i of the macroblock is at
      * src + (i+1)*linesize in the code below */
2295     src_y  -=   linesize;
2296     src_cb -= uvlinesize;
2297     src_cr -= uvlinesize;
2298
2299     if(!simple && FRAME_MBAFF){
2300         if(s->mb_y&1){
             /* odd macroblock row */
2301             offset  = MB_MBAFF ? 1 : 17;
2302             uvoffset= MB_MBAFF ? 1 : 9;
2303             if(!MB_MBAFF){
                 /* additionally store the line at src + 15*linesize
                  * (7*uvlinesize for chroma) in top_borders[0] */
2304                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2305                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2306                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2307                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2308                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2309                 }
2310             }
2311         }else{
             /* even macroblock row */
2312             if(!MB_MBAFF){
                 /* entry 0 of each left_border section is taken from
                  * top_borders[0]; the last row is skipped in the loops below */
2313                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2314                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2315                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2316                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2317                 }
2318                 skiplast= 1;
2319             }
2320             offset  =
2321             uvoffset=
2322             top_idx = MB_MBAFF ? 0 : 1;
2323         }
         /* with MB_MBAFF set, consecutive rows are stored two entries apart */
2324         step= MB_MBAFF ? 2 : 1;
2325     }
2326
2327     // There are two lines saved, the line above the top macroblock of a pair,
2328     // and the line above the bottom macroblock
     /* the first luma entry is the last luma sample of the previously saved
      * top border; it must be read before that border is overwritten below */
2329     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2330     for(i=1; i<17 - skiplast; i++){
2331         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2332     }
2333
     /* bottom luma row (row 15) of the macroblock */
2334     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2335     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2336
2337     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
         /* same scheme for both chroma planes (8 rows of 8 samples) */
2338         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2339         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2340         for(i=1; i<9 - skiplast; i++){
2341             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2342             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2343         }
2344         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2345         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2346     }
2347 }
2348
/**
 * Exchange the samples directly left of and above the current macroblock
 * in the picture with the values stored in h->left_border and
 * h->top_borders (see backup_mb_border() for how those are filled).
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line sizes to step through the planes with
 * @param xchg   nonzero: swap picture and stored values; zero: only write
 *               the stored values into the picture, leaving the stored
 *               copy untouched. Some positions are always swapped (they
 *               pass a literal 1 to XCHG below).
 * @param simple nonzero to skip the FRAME_MBAFF handling and the
 *               CODEC_FLAG_GRAY check
 */
2349 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2350     MpegEncContext * const s = &h->s;
2351     int temp8, i;
2352     uint64_t temp64;
2353     int deblock_left;
2354     int deblock_top;
2355     int mb_xy;
2356     int step    = 1;
2357     int offset  = 1;
2358     int uvoffset= 1;
2359     int top_idx = 1;
2360
     /* same left_border/top_borders addressing as in backup_mb_border() */
2361     if(!simple && FRAME_MBAFF){
2362         if(s->mb_y&1){
2363             offset  = MB_MBAFF ? 1 : 17;
2364             uvoffset= MB_MBAFF ? 1 : 9;
2365         }else{
2366             offset  =
2367             uvoffset=
2368             top_idx = MB_MBAFF ? 0 : 1;
2369         }
2370         step= MB_MBAFF ? 2 : 1;
2371     }
2372
     /* decide which neighbours take part: with deblocking_filter == 2 only
      * neighbours with the same slice_table entry as this macroblock,
      * otherwise any neighbour that exists by position */
2373     if(h->deblocking_filter == 2) {
2374         mb_xy = h->mb_xy;
2375         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2376         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2377     } else {
2378         deblock_left = (s->mb_x > 0);
2379         deblock_top =  (s->mb_y > !!MB_FIELD);
2380     }
2381
     /* move to the sample one line above and one column left of the macroblock */
2382     src_y  -=   linesize + 1;
2383     src_cb -= uvlinesize + 1;
2384     src_cr -= uvlinesize + 1;
2385
/* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
 * old value of b only if xchg is nonzero. t is scratch storage.
 * Expands to several statements, so it must not be used as the body of an
 * unbraced if/for. */
2386 #define XCHG(a,b,t,xchg)\
2387 t= a;\
2388 if(xchg)\
2389     a= b;\
2390 b= t;
2391
2392     if(deblock_left){
         /* left luma column; index 0 is the top-left corner sample and is
          * only touched when the top neighbour takes part as well */
2393         for(i = !deblock_top; i<16; i++){
2394             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2395         }
         /* i == 16 here: the last entry is always swapped */
2396         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2397     }
2398
2399     if(deblock_top){
         /* luma line above: first 8 samples honour xchg, the next 8 and the
          * first 8 of the macroblock to the right are always swapped */
2400         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2401         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2402         if(s->mb_x+1 < s->mb_width){
2403             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2404         }
2405     }
2406
2407     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
         /* same procedure for the two chroma planes */
2408         if(deblock_left){
2409             for(i = !deblock_top; i<8; i++){
2410                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2411                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2412             }
             /* i == 8 here: the last entries are always swapped */
2413             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2414             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2415         }
2416         if(deblock_top){
2417             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2418             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2419         }
2420     }
2421 }
2422
2423 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2424     MpegEncContext * const s = &h->s;
2425     const int mb_x= s->mb_x;
2426     const int mb_y= s->mb_y;
2427     const int mb_xy= h->mb_xy;
2428     const int mb_type= s->current_picture.mb_type[mb_xy];
2429     uint8_t  *dest_y, *dest_cb, *dest_cr;
2430     int linesize, uvlinesize /*dct_offset*/;
2431     int i;
2432     int *block_offset = &h->block_offset[0];
2433     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2434     /* is_h264 should always be true if SVQ3 is disabled. */
2435     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2436     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2437     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2438
2439     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2440     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2441     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2442
2443     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2444     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2445
2446     if (!simple && MB_FIELD) {
2447         linesize   = h->mb_linesize   = s->linesize * 2;
2448         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2449         block_offset = &h->block_offset[24];
2450         if(mb_y&1){ //FIXME move out of this function?
2451             dest_y -= s->linesize*15;
2452             dest_cb-= s->uvlinesize*7;
2453             dest_cr-= s->uvlinesize*7;
2454         }
2455         if(FRAME_MBAFF) {
2456             int list;
2457             for(list=0; list<h->list_count; list++){
2458                 if(!USES_LIST(mb_type, list))
2459                     continue;
2460                 if(IS_16X16(mb_type)){
2461                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2462                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2463                 }else{
2464                     for(i=0; i<16; i+=4){
2465                         int ref = h->ref_cache[list][scan8[i]];
2466                         if(ref >= 0)
2467                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2468                     }
2469                 }
2470             }
2471         }
2472     } else {
2473         linesize   = h->mb_linesize   = s->linesize;
2474         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2475 //        dct_offset = s->linesize * 16;
2476     }
2477
2478     if (!simple && IS_INTRA_PCM(mb_type)) {
2479         for (i=0; i<16; i++) {
2480             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2481         }
2482         for (i=0; i<8; i++) {
2483             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2484             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2485         }
2486     } else {
2487         if(IS_INTRA(mb_type)){
2488             if(h->deblocking_filter)
2489                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2490
2491             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2492                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2493                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2494             }
2495
2496             if(IS_INTRA4x4(mb_type)){
2497                 if(simple || !s->encoding){
2498                     if(IS_8x8DCT(mb_type)){
2499                         if(transform_bypass){
2500                             idct_dc_add =
2501                             idct_add    = s->dsp.add_pixels8;
2502                         }else{
2503                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2504                             idct_add    = s->dsp.h264_idct8_add;
2505                         }
2506                         for(i=0; i<16; i+=4){
2507                             uint8_t * const ptr= dest_y + block_offset[i];
2508                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2509                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2510                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2511                             }else{
2512                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2513                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2514                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2515                                 if(nnz){
2516                                     if(nnz == 1 && h->mb[i*16])
2517                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2518                                     else
2519                                         idct_add   (ptr, h->mb + i*16, linesize);
2520                                 }
2521                             }
2522                         }
2523                     }else{
2524                         if(transform_bypass){
2525                             idct_dc_add =
2526                             idct_add    = s->dsp.add_pixels4;
2527                         }else{
2528                             idct_dc_add = s->dsp.h264_idct_dc_add;
2529                             idct_add    = s->dsp.h264_idct_add;
2530                         }
2531                         for(i=0; i<16; i++){
2532                             uint8_t * const ptr= dest_y + block_offset[i];
2533                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2534
2535                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2536                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2537                             }else{
2538                                 uint8_t *topright;
2539                                 int nnz, tr;
2540                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2541                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2542                                     assert(mb_y || linesize <= block_offset[i]);
2543                                     if(!topright_avail){
2544                                         tr= ptr[3 - linesize]*0x01010101;
2545                                         topright= (uint8_t*) &tr;
2546                                     }else
2547                                         topright= ptr + 4 - linesize;
2548                                 }else
2549                                     topright= NULL;
2550
2551                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2552                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2553                                 if(nnz){
2554                                     if(is_h264){
2555                                         if(nnz == 1 && h->mb[i*16])
2556                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2557                                         else
2558                                             idct_add   (ptr, h->mb + i*16, linesize);
2559                                     }else
2560                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2561                                 }
2562                             }
2563                         }
2564                     }
2565                 }
2566             }else{
2567                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2568                 if(is_h264){
2569                     if(!transform_bypass)
2570                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2571                 }else
2572                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2573             }
2574             if(h->deblocking_filter)
2575                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2576         }else if(is_h264){
2577             hl_motion(h, dest_y, dest_cb, dest_cr,
2578                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2579                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2580                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2581         }
2582
2583
2584         if(!IS_INTRA4x4(mb_type)){
2585             if(is_h264){
2586                 if(IS_INTRA16x16(mb_type)){
2587                     if(transform_bypass){
2588                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2589                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2590                         }else{
2591                             for(i=0; i<16; i++){
2592                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2593                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2594                             }
2595                         }
2596                     }else{
2597                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2598                     }
2599                 }else if(h->cbp&15){
2600                     if(transform_bypass){
2601                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2602                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2603                         for(i=0; i<16; i+=di){
2604                             if(h->non_zero_count_cache[ scan8[i] ]){
2605                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2606                             }
2607                         }
2608                     }else{
2609                         if(IS_8x8DCT(mb_type)){
2610                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2611                         }else{
2612                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2613                         }
2614                     }
2615                 }
2616             }else{
2617                 for(i=0; i<16; i++){
2618                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2619                         uint8_t * const ptr= dest_y + block_offset[i];
2620                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2621                     }
2622                 }
2623             }
2624         }
2625
2626         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2627             uint8_t *dest[2] = {dest_cb, dest_cr};
2628             if(transform_bypass){
2629                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2630                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2631                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2632                 }else{
2633                     idct_add = s->dsp.add_pixels4;
2634                     for(i=16; i<16+8; i++){
2635                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2636                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2637                     }
2638                 }
2639             }else{
2640                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2641                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2642                 if(is_h264){
2643                     idct_add = s->dsp.h264_idct_add;
2644                     idct_dc_add = s->dsp.h264_idct_dc_add;
2645                     for(i=16; i<16+8; i++){
2646                         if(h->non_zero_count_cache[ scan8[i] ])
2647                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2648                         else if(h->mb[i*16])
2649                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2650                     }
2651                 }else{
2652                     for(i=16; i<16+8; i++){
2653                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2654                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2655                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2656                         }
2657                     }
2658                 }
2659             }
2660         }
2661     }
2662     if(h->cbp || IS_INTRA(mb_type))
2663         s->dsp.clear_blocks(h->mb);
2664
2665     if(h->deblocking_filter) {
2666         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2667         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2668         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2669         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2670         if (!simple && FRAME_MBAFF) {
2671             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2672         } else {
2673             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2674         }
2675     }
2676 }
2677
2678 /**
2679  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2680  */
2681 static void hl_decode_mb_simple(H264Context *h){
2682     hl_decode_mb_internal(h, 1);
2683 }
2684
2685 /**
2686  * Process a macroblock; this handles edge cases, such as interlacing.
2687  */
2688 static void av_noinline hl_decode_mb_complex(H264Context *h){
2689     hl_decode_mb_internal(h, 0);
2690 }
2691
2692 static void hl_decode_mb(H264Context *h){
2693     MpegEncContext * const s = &h->s;
2694     const int mb_xy= h->mb_xy;
2695     const int mb_type= s->current_picture.mb_type[mb_xy];
2696     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2697
2698     if (is_complex)
2699         hl_decode_mb_complex(h);
2700     else hl_decode_mb_simple(h);
2701 }
2702
2703 static void pic_as_field(Picture *pic, const int parity){
2704     int i;
2705     for (i = 0; i < 4; ++i) {
2706         if (parity == PICT_BOTTOM_FIELD)
2707             pic->data[i] += pic->linesize[i];
2708         pic->reference = parity;
2709         pic->linesize[i] *= 2;
2710     }
2711     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2712 }
2713
2714 static int split_field_copy(Picture *dest, Picture *src,
2715                             int parity, int id_add){
2716     int match = !!(src->reference & parity);
2717
2718     if (match) {
2719         *dest = *src;
2720         if(parity != PICT_FRAME){
2721             pic_as_field(dest, parity);
2722             dest->pic_id *= 2;
2723             dest->pic_id += id_add;
2724         }
2725     }
2726
2727     return match;
2728 }
2729
2730 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2731     int i[2]={0};
2732     int index=0;
2733
2734     while(i[0]<len || i[1]<len){
2735         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2736             i[0]++;
2737         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2738             i[1]++;
2739         if(i[0] < len){
2740             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2741             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2742         }
2743         if(i[1] < len){
2744             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2745             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2746         }
2747     }
2748
2749     return index;
2750 }
2751
2752 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2753     int i, best_poc;
2754     int out_i= 0;
2755
2756     for(;;){
2757         best_poc= dir ? INT_MIN : INT_MAX;
2758
2759         for(i=0; i<len; i++){
2760             const int poc= src[i]->poc;
2761             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2762                 best_poc= poc;
2763                 sorted[out_i]= src[i];
2764             }
2765         }
2766         if(best_poc == (dir ? INT_MIN : INT_MAX))
2767             break;
2768         limit= sorted[out_i++]->poc - dir;
2769     }
2770     return out_i;
2771 }
2772
2773 /**
2774  * fills the default_ref_list.
2775  */
2776 static int fill_default_ref_list(H264Context *h){
2777     MpegEncContext * const s = &h->s;
2778     int i, len;
2779
2780     if(h->slice_type_nos==FF_B_TYPE){
2781         Picture *sorted[32];
2782         int cur_poc, list;
2783         int lens[2];
2784
2785         if(FIELD_PICTURE)
2786             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2787         else
2788             cur_poc= s->current_picture_ptr->poc;
2789
2790         for(list= 0; list<2; list++){
2791             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2792             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2793             assert(len<=32);
2794             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2795             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2796             assert(len<=32);
2797
2798             if(len < h->ref_count[list])
2799                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2800             lens[list]= len;
2801         }
2802
2803         if(lens[0] == lens[1] && lens[1] > 1){
2804             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2805             if(i == lens[0])
2806                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2807         }
2808     }else{
2809         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2810         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2811         assert(len <= 32);
2812         if(len < h->ref_count[0])
2813             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2814     }
2815 #ifdef TRACE
2816     for (i=0; i<h->ref_count[0]; i++) {
2817         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2818     }
2819     if(h->slice_type_nos==FF_B_TYPE){
2820         for (i=0; i<h->ref_count[1]; i++) {
2821             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2822         }
2823     }
2824 #endif
2825     return 0;
2826 }
2827
/* Debug helpers defined further down in this file; declared here because
 * decode_ref_pic_list_reordering() calls them before their definition. */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
2830
2831 /**
2832  * Extract structure information about the picture described by pic_num in
2833  * the current decoding context (frame or field). Note that pic_num is
2834  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2835  * @param pic_num picture number for which to extract structure information
2836  * @param structure one of PICT_XXX describing structure of picture
2837  *                      with pic_num
2838  * @return frame number (short term) or long term index of picture
2839  *         described by pic_num
2840  */
2841 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2842     MpegEncContext * const s = &h->s;
2843
2844     *structure = s->picture_structure;
2845     if(FIELD_PICTURE){
2846         if (!(pic_num & 1))
2847             /* opposite field */
2848             *structure ^= PICT_FRAME;
2849         pic_num >>= 1;
2850     }
2851
2852     return pic_num;
2853 }
2854
2855 static int decode_ref_pic_list_reordering(H264Context *h){
2856     MpegEncContext * const s = &h->s;
2857     int list, index, pic_structure;
2858
2859     print_short_term(h);
2860     print_long_term(h);
2861
2862     for(list=0; list<h->list_count; list++){
2863         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2864
2865         if(get_bits1(&s->gb)){
2866             int pred= h->curr_pic_num;
2867
2868             for(index=0; ; index++){
2869                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2870                 unsigned int pic_id;
2871                 int i;
2872                 Picture *ref = NULL;
2873
2874                 if(reordering_of_pic_nums_idc==3)
2875                     break;
2876
2877                 if(index >= h->ref_count[list]){
2878                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2879                     return -1;
2880                 }
2881
2882                 if(reordering_of_pic_nums_idc<3){
2883                     if(reordering_of_pic_nums_idc<2){
2884                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2885                         int frame_num;
2886
2887                         if(abs_diff_pic_num > h->max_pic_num){
2888                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2889                             return -1;
2890                         }
2891
2892                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2893                         else                                pred+= abs_diff_pic_num;
2894                         pred &= h->max_pic_num - 1;
2895
2896                         frame_num = pic_num_extract(h, pred, &pic_structure);
2897
2898                         for(i= h->short_ref_count-1; i>=0; i--){
2899                             ref = h->short_ref[i];
2900                             assert(ref->reference);
2901                             assert(!ref->long_ref);
2902                             if(
2903                                    ref->frame_num == frame_num &&
2904                                    (ref->reference & pic_structure)
2905                               )
2906                                 break;
2907                         }
2908                         if(i>=0)
2909                             ref->pic_id= pred;
2910                     }else{
2911                         int long_idx;
2912                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2913
2914                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2915
2916                         if(long_idx>31){
2917                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2918                             return -1;
2919                         }
2920                         ref = h->long_ref[long_idx];
2921                         assert(!(ref && !ref->reference));
2922                         if(ref && (ref->reference & pic_structure)){
2923                             ref->pic_id= pic_id;
2924                             assert(ref->long_ref);
2925                             i=0;
2926                         }else{
2927                             i=-1;
2928                         }
2929                     }
2930
2931                     if (i < 0) {
2932                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2933                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2934                     } else {
2935                         for(i=index; i+1<h->ref_count[list]; i++){
2936                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2937                                 break;
2938                         }
2939                         for(; i > index; i--){
2940                             h->ref_list[list][i]= h->ref_list[list][i-1];
2941                         }
2942                         h->ref_list[list][index]= *ref;
2943                         if (FIELD_PICTURE){
2944                             pic_as_field(&h->ref_list[list][index], pic_structure);
2945                         }
2946                     }
2947                 }else{
2948                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2949                     return -1;
2950                 }
2951             }
2952         }
2953     }
2954     for(list=0; list<h->list_count; list++){
2955         for(index= 0; index < h->ref_count[list]; index++){
2956             if(!h->ref_list[list][index].data[0]){
2957                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2958                 if(h->default_ref_list[list][0].data[0])
2959                     h->ref_list[list][index]= h->default_ref_list[list][0];
2960                 else
2961                     return -1;
2962             }
2963         }
2964     }
2965
2966     return 0;
2967 }
2968
2969 static void fill_mbaff_ref_list(H264Context *h){
2970     int list, i, j;
2971     for(list=0; list<2; list++){ //FIXME try list_count
2972         for(i=0; i<h->ref_count[list]; i++){
2973             Picture *frame = &h->ref_list[list][i];
2974             Picture *field = &h->ref_list[list][16+2*i];
2975             field[0] = *frame;
2976             for(j=0; j<3; j++)
2977                 field[0].linesize[j] <<= 1;
2978             field[0].reference = PICT_TOP_FIELD;
2979             field[0].poc= field[0].field_poc[0];
2980             field[1] = field[0];
2981             for(j=0; j<3; j++)
2982                 field[1].data[j] += frame->linesize[j];
2983             field[1].reference = PICT_BOTTOM_FIELD;
2984             field[1].poc= field[1].field_poc[1];
2985
2986             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2987             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2988             for(j=0; j<2; j++){
2989                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2990                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2991             }
2992         }
2993     }
2994     for(j=0; j<h->ref_count[1]; j++){
2995         for(i=0; i<h->ref_count[0]; i++)
2996             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2997         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2998         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2999     }
3000 }
3001
3002 static int pred_weight_table(H264Context *h){
3003     MpegEncContext * const s = &h->s;
3004     int list, i;
3005     int luma_def, chroma_def;
3006
3007     h->use_weight= 0;
3008     h->use_weight_chroma= 0;
3009     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3010     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3011     luma_def = 1<<h->luma_log2_weight_denom;
3012     chroma_def = 1<<h->chroma_log2_weight_denom;
3013
3014     for(list=0; list<2; list++){
3015         h->luma_weight_flag[list]   = 0;
3016         h->chroma_weight_flag[list] = 0;
3017         for(i=0; i<h->ref_count[list]; i++){
3018             int luma_weight_flag, chroma_weight_flag;
3019
3020             luma_weight_flag= get_bits1(&s->gb);
3021             if(luma_weight_flag){
3022                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3023                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3024                 if(   h->luma_weight[list][i] != luma_def
3025                    || h->luma_offset[list][i] != 0) {
3026                     h->use_weight= 1;
3027                     h->luma_weight_flag[list]= 1;
3028                 }
3029             }else{
3030                 h->luma_weight[list][i]= luma_def;
3031                 h->luma_offset[list][i]= 0;
3032             }
3033
3034             if(CHROMA){
3035                 chroma_weight_flag= get_bits1(&s->gb);
3036                 if(chroma_weight_flag){
3037                     int j;
3038                     for(j=0; j<2; j++){
3039                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3040                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3041                         if(   h->chroma_weight[list][i][j] != chroma_def
3042                            || h->chroma_offset[list][i][j] != 0) {
3043                             h->use_weight_chroma= 1;
3044                             h->chroma_weight_flag[list]= 1;
3045                         }
3046                     }
3047                 }else{
3048                     int j;
3049                     for(j=0; j<2; j++){
3050                         h->chroma_weight[list][i][j]= chroma_def;
3051                         h->chroma_offset[list][i][j]= 0;
3052                     }
3053                 }
3054             }
3055         }
3056         if(h->slice_type_nos != FF_B_TYPE) break;
3057     }
3058     h->use_weight= h->use_weight || h->use_weight_chroma;
3059     return 0;
3060 }
3061
3062 static void implicit_weight_table(H264Context *h){
3063     MpegEncContext * const s = &h->s;
3064     int ref0, ref1, i;
3065     int cur_poc = s->current_picture_ptr->poc;
3066
3067     for (i = 0; i < 2; i++) {
3068         h->luma_weight_flag[i]   = 0;
3069         h->chroma_weight_flag[i] = 0;
3070     }
3071
3072     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3073        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3074         h->use_weight= 0;
3075         h->use_weight_chroma= 0;
3076         return;
3077     }
3078
3079     h->use_weight= 2;
3080     h->use_weight_chroma= 2;
3081     h->luma_log2_weight_denom= 5;
3082     h->chroma_log2_weight_denom= 5;
3083
3084     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3085         int poc0 = h->ref_list[0][ref0].poc;
3086         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3087             int poc1 = h->ref_list[1][ref1].poc;
3088             int td = av_clip(poc1 - poc0, -128, 127);
3089             if(td){
3090                 int tb = av_clip(cur_poc - poc0, -128, 127);
3091                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3092                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3093                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3094                     h->implicit_weight[ref0][ref1] = 32;
3095                 else
3096                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3097             }else
3098                 h->implicit_weight[ref0][ref1] = 32;
3099         }
3100     }
3101 }
3102
3103 /**
3104  * Mark a picture as no longer needed for reference. The refmask
3105  * argument allows unreferencing of individual fields or the whole frame.
3106  * If the picture becomes entirely unreferenced, but is being held for
3107  * display purposes, it is marked as such.
3108  * @param refmask mask of fields to unreference; the mask is bitwise
3109  *                anded with the reference marking of pic
3110  * @return non-zero if pic becomes entirely unreferenced (except possibly
3111  *         for display purposes) zero if one of the fields remains in
3112  *         reference
3113  */
3114 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3115     int i;
3116     if (pic->reference &= refmask) {
3117         return 0;
3118     } else {
3119         for(i = 0; h->delayed_pic[i]; i++)
3120             if(pic == h->delayed_pic[i]){
3121                 pic->reference=DELAYED_PIC_REF;
3122                 break;
3123             }
3124         return 1;
3125     }
3126 }
3127
3128 /**
3129  * instantaneous decoder refresh.
3130  */
3131 static void idr(H264Context *h){
3132     int i;
3133
3134     for(i=0; i<16; i++){
3135         remove_long(h, i, 0);
3136     }
3137     assert(h->long_ref_count==0);
3138
3139     for(i=0; i<h->short_ref_count; i++){
3140         unreference_pic(h, h->short_ref[i], 0);
3141         h->short_ref[i]= NULL;
3142     }
3143     h->short_ref_count=0;
3144     h->prev_frame_num= 0;
3145     h->prev_frame_num_offset= 0;
3146     h->prev_poc_msb=
3147     h->prev_poc_lsb= 0;
3148 }
3149
3150 /* forget old pics after a seek */
3151 static void flush_dpb(AVCodecContext *avctx){
3152     H264Context *h= avctx->priv_data;
3153     int i;
3154     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3155         if(h->delayed_pic[i])
3156             h->delayed_pic[i]->reference= 0;
3157         h->delayed_pic[i]= NULL;
3158     }
3159     h->outputed_poc= INT_MIN;
3160     h->prev_interlaced_frame = 1;
3161     idr(h);
3162     if(h->s.current_picture_ptr)
3163         h->s.current_picture_ptr->reference= 0;
3164     h->s.first_field= 0;
3165     reset_sei(h);
3166     ff_mpeg_flush(avctx);
3167 }
3168
3169 /**
3170  * Find a Picture in the short term reference list by frame number.
3171  * @param frame_num frame number to search for
3172  * @param idx the index into h->short_ref where returned picture is found
3173  *            undefined if no picture found.
3174  * @return pointer to the found picture, or NULL if no pic with the provided
3175  *                 frame number is found
3176  */
3177 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3178     MpegEncContext * const s = &h->s;
3179     int i;
3180
3181     for(i=0; i<h->short_ref_count; i++){
3182         Picture *pic= h->short_ref[i];
3183         if(s->avctx->debug&FF_DEBUG_MMCO)
3184             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3185         if(pic->frame_num == frame_num) {
3186             *idx = i;
3187             return pic;
3188         }
3189     }
3190     return NULL;
3191 }
3192
3193 /**
3194  * Remove a picture from the short term reference list by its index in
3195  * that list.  This does no checking on the provided index; it is assumed
3196  * to be valid. Other list entries are shifted down.
3197  * @param i index into h->short_ref of picture to remove.
3198  */
3199 static void remove_short_at_index(H264Context *h, int i){
3200     assert(i >= 0 && i < h->short_ref_count);
3201     h->short_ref[i]= NULL;
3202     if (--h->short_ref_count)
3203         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3204 }
3205
3206 /**
3207  *
3208  * @return the removed picture or NULL if an error occurs
3209  */
3210 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3211     MpegEncContext * const s = &h->s;
3212     Picture *pic;
3213     int i;
3214
3215     if(s->avctx->debug&FF_DEBUG_MMCO)
3216         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3217
3218     pic = find_short(h, frame_num, &i);
3219     if (pic){
3220         if(unreference_pic(h, pic, ref_mask))
3221         remove_short_at_index(h, i);
3222     }
3223
3224     return pic;
3225 }
3226
3227 /**
3228  * Remove a picture from the long term reference list by its index in
3229  * that list.
3230  * @return the removed picture or NULL if an error occurs
3231  */
3232 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3233     Picture *pic;
3234
3235     pic= h->long_ref[i];
3236     if (pic){
3237         if(unreference_pic(h, pic, ref_mask)){
3238             assert(h->long_ref[i]->long_ref == 1);
3239             h->long_ref[i]->long_ref= 0;
3240             h->long_ref[i]= NULL;
3241             h->long_ref_count--;
3242         }
3243     }
3244
3245     return pic;
3246 }
3247
3248 /**
3249  * print short term list
3250  */
3251 static void print_short_term(H264Context *h) {
3252     uint32_t i;
3253     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3254         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3255         for(i=0; i<h->short_ref_count; i++){
3256             Picture *pic= h->short_ref[i];
3257             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3258         }
3259     }
3260 }
3261
3262 /**
3263  * print long term list
3264  */
3265 static void print_long_term(H264Context *h) {
3266     uint32_t i;
3267     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3268         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3269         for(i = 0; i < 16; i++){
3270             Picture *pic= h->long_ref[i];
3271             if (pic) {
3272                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3273             }
3274         }
3275     }
3276 }
3277
/**
 * Executes the reference picture marking (memory management control operations).
 *
 * The operations in mmco[] are applied in order to the short term and long
 * term reference lists. Afterwards, unless one of the operations already
 * assigned the current picture as a long term reference, the current picture
 * is put at the head of the short term list (or, if it already is the head,
 * just marked as a full frame reference). Finally, if the total number of
 * references exceeds sps.ref_frame_count, one reference is discarded.
 * @param mmco       operations to execute
 * @param mmco_count number of entries in mmco
 * @return 0 always
 */
static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
    MpegEncContext * const s = &h->s;
    int i, av_uninit(j);
    int current_ref_assigned=0;
    Picture *av_uninit(pic);

    if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
        av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");

    for(i=0; i<mmco_count; i++){
        int av_uninit(structure), av_uninit(frame_num);
        /* NOTE(review): the debug output below reads h->mmco[i] rather than
         * the mmco parameter; presumably callers always pass h->mmco. */
        if(s->avctx->debug&FF_DEBUG_MMCO)
            av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);

        /* Both short term operations first need the addressed picture;
         * pic and j (its index in short_ref) are used by the switch below. */
        if(   mmco[i].opcode == MMCO_SHORT2UNUSED
           || mmco[i].opcode == MMCO_SHORT2LONG){
            frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
            pic = find_short(h, frame_num, &j);
            if(!pic){
                /* Not an error if SHORT2LONG targets a picture that already
                 * sits in the requested long term slot. */
                if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
                   || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
                av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
                continue;
            }
        }

        switch(mmco[i].opcode){
        case MMCO_SHORT2UNUSED:
            if(s->avctx->debug&FF_DEBUG_MMCO)
                av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
            /* keep only the field(s) not addressed by structure */
            remove_short(h, frame_num, structure ^ PICT_FRAME);
            break;
        case MMCO_SHORT2LONG:
                /* free the target long term slot unless it already holds pic */
                if (h->long_ref[mmco[i].long_arg] != pic)
                    remove_long(h, mmco[i].long_arg, 0);

                remove_short_at_index(h, j);
                h->long_ref[ mmco[i].long_arg ]= pic;
                if (h->long_ref[ mmco[i].long_arg ]){
                    h->long_ref[ mmco[i].long_arg ]->long_ref=1;
                    h->long_ref_count++;
                }
            break;
        case MMCO_LONG2UNUSED:
            j = pic_num_extract(h, mmco[i].long_arg, &structure);
            pic = h->long_ref[j];
            if (pic) {
                remove_long(h, j, structure ^ PICT_FRAME);
            } else if(s->avctx->debug&FF_DEBUG_MMCO)
                av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
            break;
        case MMCO_LONG:
                    // Comment below left from previous code as it is an interesting note.
                    /* First field in pair is in short term list or
                     * at a different long term index.
                     * This is not allowed; see 7.4.3.3, notes 2 and 3.
                     * Report the problem and keep the pair where it is,
                     * and mark this field valid.
                     */

            /* assign the current picture to the long term slot, replacing
             * whatever was stored there */
            if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
                remove_long(h, mmco[i].long_arg, 0);

                h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
                h->long_ref[ mmco[i].long_arg ]->long_ref=1;
                h->long_ref_count++;
            }

            s->current_picture_ptr->reference |= s->picture_structure;
            current_ref_assigned=1;
            break;
        case MMCO_SET_MAX_LONG:
            assert(mmco[i].long_arg <= 16);
            // just remove the long term which index is greater than new max
            for(j = mmco[i].long_arg; j<16; j++){
                remove_long(h, j, 0);
            }
            break;
        case MMCO_RESET:
            /* drop every reference and reset POC / frame_num state */
            while(h->short_ref_count){
                remove_short(h, h->short_ref[0]->frame_num, 0);
            }
            for(j = 0; j < 16; j++) {
                remove_long(h, j, 0);
            }
            s->current_picture_ptr->poc=
            s->current_picture_ptr->field_poc[0]=
            s->current_picture_ptr->field_poc[1]=
            h->poc_lsb=
            h->poc_msb=
            h->frame_num=
            s->current_picture_ptr->frame_num= 0;
            s->current_picture_ptr->mmco_reset=1;
            break;
        default: assert(0);
        }
    }

    if (!current_ref_assigned) {
        /* Second field of complementary field pair; the first field of
         * which is already referenced. If short referenced, it
         * should be first entry in short_ref. If not, it must exist
         * in long_ref; trying to put it on the short list here is an
         * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
         */
        if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
            /* Just mark the second field valid */
            s->current_picture_ptr->reference = PICT_FRAME;
        } else if (s->current_picture_ptr->long_ref) {
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
                                             "assignment for second field "
                                             "in complementary field pair "
                                             "(first field is long term)\n");
        } else {
            /* a short term picture with the same frame_num must not exist
             * yet; if one does, it is removed and the problem reported */
            pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
            if(pic){
                av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
            }

            /* insert the current picture at the head of the short term list */
            if(h->short_ref_count)
                memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));

            h->short_ref[0]= s->current_picture_ptr;
            h->short_ref_count++;
            s->current_picture_ptr->reference |= s->picture_structure;
        }
    }

    if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){

        /* We have too many reference frames, probably due to corrupted
         * stream. Need to discard one frame. Prevents overrun of the
         * short_ref and long_ref buffers.
         */
        av_log(h->s.avctx, AV_LOG_ERROR,
               "number of reference frames exceeds max (probably "
               "corrupt input), discarding one\n");

        if (h->long_ref_count && !h->short_ref_count) {
            /* only long term references exist: drop the first one found */
            for (i = 0; i < 16; ++i)
                if (h->long_ref[i])
                    break;

            assert(i < 16);
            remove_long(h, i, 0);
        } else {
            /* otherwise drop the last entry of the short term list */
            pic = h->short_ref[h->short_ref_count - 1];
            remove_short(h, pic->frame_num, 0);
        }
    }

    print_short_term(h);
    print_long_term(h);
    return 0;
}
3436
3437 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3438     MpegEncContext * const s = &h->s;
3439     int i;
3440
3441     h->mmco_index= 0;
3442     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3443         s->broken_link= get_bits1(gb) -1;
3444         if(get_bits1(gb)){
3445             h->mmco[0].opcode= MMCO_LONG;
3446             h->mmco[0].long_arg= 0;
3447             h->mmco_index= 1;
3448         }
3449     }else{
3450         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3451             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3452                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3453
3454                 h->mmco[i].opcode= opcode;
3455                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3456                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3457 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3458                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3459                         return -1;
3460                     }*/
3461                 }
3462                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3463                     unsigned int long_arg= get_ue_golomb_31(gb);
3464                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3465                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3466                         return -1;
3467                     }
3468                     h->mmco[i].long_arg= long_arg;
3469                 }
3470
3471                 if(opcode > (unsigned)MMCO_LONG){
3472                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3473                     return -1;
3474                 }
3475                 if(opcode == MMCO_END)
3476                     break;
3477             }
3478             h->mmco_index= i;
3479         }else{
3480             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3481
3482             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3483                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3484                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3485                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3486                 h->mmco_index= 1;
3487                 if (FIELD_PICTURE) {
3488                     h->mmco[0].short_pic_num *= 2;
3489                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3490                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3491                     h->mmco_index= 2;
3492                 }
3493             }
3494         }
3495     }
3496
3497     return 0;
3498 }
3499
3500 static int init_poc(H264Context *h){
3501     MpegEncContext * const s = &h->s;
3502     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3503     int field_poc[2];
3504     Picture *cur = s->current_picture_ptr;
3505
3506     h->frame_num_offset= h->prev_frame_num_offset;
3507     if(h->frame_num < h->prev_frame_num)
3508         h->frame_num_offset += max_frame_num;
3509
3510     if(h->sps.poc_type==0){
3511         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3512
3513         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3514             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3515         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3516             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3517         else
3518             h->poc_msb = h->prev_poc_msb;
3519 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3520         field_poc[0] =
3521         field_poc[1] = h->poc_msb + h->poc_lsb;
3522         if(s->picture_structure == PICT_FRAME)
3523             field_poc[1] += h->delta_poc_bottom;
3524     }else if(h->sps.poc_type==1){
3525         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3526         int i;
3527
3528         if(h->sps.poc_cycle_length != 0)
3529             abs_frame_num = h->frame_num_offset + h->frame_num;
3530         else
3531             abs_frame_num = 0;
3532
3533         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3534             abs_frame_num--;
3535
3536         expected_delta_per_poc_cycle = 0;
3537         for(i=0; i < h->sps.poc_cycle_length; i++)
3538             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3539
3540         if(abs_frame_num > 0){
3541             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3542             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3543
3544             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3545             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3546                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3547         } else
3548             expectedpoc = 0;
3549
3550         if(h->nal_ref_idc == 0)
3551             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3552
3553         field_poc[0] = expectedpoc + h->delta_poc[0];
3554         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3555
3556         if(s->picture_structure == PICT_FRAME)
3557             field_poc[1] += h->delta_poc[1];
3558     }else{
3559         int poc= 2*(h->frame_num_offset + h->frame_num);
3560
3561         if(!h->nal_ref_idc)
3562             poc--;
3563
3564         field_poc[0]= poc;
3565         field_poc[1]= poc;
3566     }
3567
3568     if(s->picture_structure != PICT_BOTTOM_FIELD)
3569         s->current_picture_ptr->field_poc[0]= field_poc[0];
3570     if(s->picture_structure != PICT_TOP_FIELD)
3571         s->current_picture_ptr->field_poc[1]= field_poc[1];
3572     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3573
3574     return 0;
3575 }
3576
3577
3578 /**
3579  * initialize scan tables
3580  */
3581 static void init_scan_tables(H264Context *h){
3582     MpegEncContext * const s = &h->s;
3583     int i;
3584     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3585         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3586         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3587     }else{
3588         for(i=0; i<16; i++){
3589 #define T(x) (x>>2) | ((x<<2) & 0xF)
3590             h->zigzag_scan[i] = T(zigzag_scan[i]);
3591             h-> field_scan[i] = T( field_scan[i]);
3592 #undef T
3593         }
3594     }
3595     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3596         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3597         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3598         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3599         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3600     }else{
3601         for(i=0; i<64; i++){
3602 #define T(x) (x>>3) | ((x&7)<<3)
3603             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3604             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3605             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3606             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3607 #undef T
3608         }
3609     }
3610     if(h->sps.transform_bypass){ //FIXME same ugly
3611         h->zigzag_scan_q0          = zigzag_scan;
3612         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3613         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3614         h->field_scan_q0           = field_scan;
3615         h->field_scan8x8_q0        = field_scan8x8;
3616         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3617     }else{
3618         h->zigzag_scan_q0          = h->zigzag_scan;
3619         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3620         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3621         h->field_scan_q0           = h->field_scan;
3622         h->field_scan8x8_q0        = h->field_scan8x8;
3623         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3624     }
3625 }
3626
3627 static void field_end(H264Context *h){
3628     MpegEncContext * const s = &h->s;
3629     AVCodecContext * const avctx= s->avctx;
3630     s->mb_y= 0;
3631
3632     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3633     s->current_picture_ptr->pict_type= s->pict_type;
3634
3635     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3636         ff_vdpau_h264_set_reference_frames(s);
3637
3638     if(!s->dropable) {
3639         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3640         h->prev_poc_msb= h->poc_msb;
3641         h->prev_poc_lsb= h->poc_lsb;
3642     }
3643     h->prev_frame_num_offset= h->frame_num_offset;
3644     h->prev_frame_num= h->frame_num;
3645
3646     if (avctx->hwaccel) {
3647         if (avctx->hwaccel->end_frame(avctx) < 0)
3648             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3649     }
3650
3651     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3652         ff_vdpau_h264_picture_complete(s);
3653
3654     /*
3655      * FIXME: Error handling code does not seem to support interlaced
3656      * when slices span multiple rows
3657      * The ff_er_add_slice calls don't work right for bottom
3658      * fields; they cause massive erroneous error concealing
3659      * Error marking covers both fields (top and bottom).
3660      * This causes a mismatched s->error_count
3661      * and a bad error table. Further, the error count goes to
3662      * INT_MAX when called for bottom field, because mb_y is
3663      * past end by one (callers fault) and resync_mb_y != 0
3664      * causes problems for the first MB line, too.
3665      */
3666     if (!FIELD_PICTURE)
3667         ff_er_frame_end(s);
3668
3669     MPV_frame_end(s);
3670
3671     h->current_slice=0;
3672 }
3673
3674 /**
3675  * Replicates H264 "master" context to thread contexts.
3676  */
3677 static void clone_slice(H264Context *dst, H264Context *src)
3678 {
3679     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3680     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3681     dst->s.current_picture      = src->s.current_picture;
3682     dst->s.linesize             = src->s.linesize;
3683     dst->s.uvlinesize           = src->s.uvlinesize;
3684     dst->s.first_field          = src->s.first_field;
3685
3686     dst->prev_poc_msb           = src->prev_poc_msb;
3687     dst->prev_poc_lsb           = src->prev_poc_lsb;
3688     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3689     dst->prev_frame_num         = src->prev_frame_num;
3690     dst->short_ref_count        = src->short_ref_count;
3691
3692     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3693     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3694     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3695     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3696
3697     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3698     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3699 }
3700
3701 /**
3702  * decodes a slice header.
3703  * This will also call MPV_common_init() and frame_start() as needed.
3704  *
3705  * @param h h264context
3706  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3707  *
3708  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3709  */
3710 static int decode_slice_header(H264Context *h, H264Context *h0){
3711     MpegEncContext * const s = &h->s;
3712     MpegEncContext * const s0 = &h0->s;
3713     unsigned int first_mb_in_slice;
3714     unsigned int pps_id;
3715     int num_ref_idx_active_override_flag;
3716     unsigned int slice_type, tmp, i, j;
3717     int default_ref_list_done = 0;
3718     int last_pic_structure;
3719
3720     s->dropable= h->nal_ref_idc == 0;
3721
3722     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3723         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3724         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3725     }else{
3726         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3727         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3728     }
3729
3730     first_mb_in_slice= get_ue_golomb(&s->gb);
3731
3732     if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3733         if(h0->current_slice && FIELD_PICTURE){
3734             field_end(h);
3735         }
3736
3737         h0->current_slice = 0;
3738         if (!s0->first_field)
3739             s->current_picture_ptr= NULL;
3740     }
3741
3742     slice_type= get_ue_golomb_31(&s->gb);
3743     if(slice_type > 9){
3744         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3745         return -1;
3746     }
3747     if(slice_type > 4){
3748         slice_type -= 5;
3749         h->slice_type_fixed=1;
3750     }else
3751         h->slice_type_fixed=0;
3752
3753     slice_type= golomb_to_pict_type[ slice_type ];
3754     if (slice_type == FF_I_TYPE
3755         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3756         default_ref_list_done = 1;
3757     }
3758     h->slice_type= slice_type;
3759     h->slice_type_nos= slice_type & 3;
3760
3761     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3762     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3763         av_log(h->s.avctx, AV_LOG_ERROR,
3764                "B picture before any references, skipping\n");
3765         return -1;
3766     }
3767
3768     pps_id= get_ue_golomb(&s->gb);
3769     if(pps_id>=MAX_PPS_COUNT){
3770         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3771         return -1;
3772     }
3773     if(!h0->pps_buffers[pps_id]) {
3774         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3775         return -1;
3776     }
3777     h->pps= *h0->pps_buffers[pps_id];
3778
3779     if(!h0->sps_buffers[h->pps.sps_id]) {
3780         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3781         return -1;
3782     }
3783     h->sps = *h0->sps_buffers[h->pps.sps_id];
3784
3785     if(h == h0 && h->dequant_coeff_pps != pps_id){
3786         h->dequant_coeff_pps = pps_id;
3787         init_dequant_tables(h);
3788     }
3789
3790     s->mb_width= h->sps.mb_width;
3791     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3792
3793     h->b_stride=  s->mb_width*4;
3794     h->b8_stride= s->mb_width*2;
3795
3796     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3797     if(h->sps.frame_mbs_only_flag)
3798         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3799     else
3800         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3801
3802     if (s->context_initialized
3803         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3804         if(h != h0)
3805             return -1;   // width / height changed during parallelized decoding
3806         free_tables(h);
3807         flush_dpb(s->avctx);
3808         MPV_common_end(s);
3809     }
3810     if (!s->context_initialized) {
3811         if(h != h0)
3812             return -1;  // we cant (re-)initialize context during parallel decoding
3813         if (MPV_common_init(s) < 0)
3814             return -1;
3815         s->first_field = 0;
3816         h->prev_interlaced_frame = 1;
3817
3818         init_scan_tables(h);
3819         alloc_tables(h);
3820
3821         for(i = 1; i < s->avctx->thread_count; i++) {
3822             H264Context *c;
3823             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3824             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3825             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3826             c->sps = h->sps;
3827             c->pps = h->pps;
3828             init_scan_tables(c);
3829             clone_tables(c, h);
3830         }
3831
3832         for(i = 0; i < s->avctx->thread_count; i++)
3833             if(context_init(h->thread_context[i]) < 0)
3834                 return -1;
3835
3836         s->avctx->width = s->width;
3837         s->avctx->height = s->height;
3838         s->avctx->sample_aspect_ratio= h->sps.sar;
3839         if(!s->avctx->sample_aspect_ratio.den)
3840             s->avctx->sample_aspect_ratio.den = 1;
3841
3842         if(h->sps.timing_info_present_flag){
3843             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3844             if(h->x264_build > 0 && h->x264_build < 44)
3845                 s->avctx->time_base.den *= 2;
3846             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3847                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3848         }
3849     }
3850
3851     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3852
3853     h->mb_mbaff = 0;
3854     h->mb_aff_frame = 0;
3855     last_pic_structure = s0->picture_structure;
3856     if(h->sps.frame_mbs_only_flag){
3857         s->picture_structure= PICT_FRAME;
3858     }else{
3859         if(get_bits1(&s->gb)) { //field_pic_flag
3860             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3861         } else {
3862             s->picture_structure= PICT_FRAME;
3863             h->mb_aff_frame = h->sps.mb_aff;
3864         }
3865     }
3866     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3867
3868     if(h0->current_slice == 0){
3869         while(h->frame_num !=  h->prev_frame_num &&
3870               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3871             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3872             if (frame_start(h) < 0)
3873                 return -1;
3874             h->prev_frame_num++;
3875             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3876             s->current_picture_ptr->frame_num= h->prev_frame_num;
3877             execute_ref_pic_marking(h, NULL, 0);
3878         }
3879
3880         /* See if we have a decoded first field looking for a pair... */
3881         if (s0->first_field) {
3882             assert(s0->current_picture_ptr);
3883             assert(s0->current_picture_ptr->data[0]);
3884             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3885
3886             /* figure out if we have a complementary field pair */
3887             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3888                 /*
3889                  * Previous field is unmatched. Don't display it, but let it
3890                  * remain for reference if marked as such.
3891                  */
3892                 s0->current_picture_ptr = NULL;
3893                 s0->first_field = FIELD_PICTURE;
3894
3895             } else {
3896                 if (h->nal_ref_idc &&
3897                         s0->current_picture_ptr->reference &&
3898                         s0->current_picture_ptr->frame_num != h->frame_num) {
3899                     /*
3900                      * This and previous field were reference, but had
3901                      * different frame_nums. Consider this field first in
3902                      * pair. Throw away previous field except for reference
3903                      * purposes.
3904                      */
3905                     s0->first_field = 1;
3906                     s0->current_picture_ptr = NULL;
3907
3908                 } else {
3909                     /* Second field in complementary pair */
3910                     s0->first_field = 0;
3911                 }
3912             }
3913
3914         } else {
3915             /* Frame or first field in a potentially complementary pair */
3916             assert(!s0->current_picture_ptr);
3917             s0->first_field = FIELD_PICTURE;
3918         }
3919
3920         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3921             s0->first_field = 0;
3922             return -1;
3923         }
3924     }
3925     if(h != h0)
3926         clone_slice(h, h0);
3927
3928     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3929
3930     assert(s->mb_num == s->mb_width * s->mb_height);
3931     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3932        first_mb_in_slice                    >= s->mb_num){
3933         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3934         return -1;
3935     }
3936     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3937     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3938     if (s->picture_structure == PICT_BOTTOM_FIELD)
3939         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3940     assert(s->mb_y < s->mb_height);
3941
3942     if(s->picture_structure==PICT_FRAME){
3943         h->curr_pic_num=   h->frame_num;
3944         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3945     }else{
3946         h->curr_pic_num= 2*h->frame_num + 1;
3947         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3948     }
3949
3950     if(h->nal_unit_type == NAL_IDR_SLICE){
3951         get_ue_golomb(&s->gb); /* idr_pic_id */
3952     }
3953
3954     if(h->sps.poc_type==0){
3955         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3956
3957         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3958             h->delta_poc_bottom= get_se_golomb(&s->gb);
3959         }
3960     }
3961
3962     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3963         h->delta_poc[0]= get_se_golomb(&s->gb);
3964
3965         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3966             h->delta_poc[1]= get_se_golomb(&s->gb);
3967     }
3968
3969     init_poc(h);
3970
3971     if(h->pps.redundant_pic_cnt_present){
3972         h->redundant_pic_count= get_ue_golomb(&s->gb);
3973     }
3974
3975     //set defaults, might be overridden a few lines later
3976     h->ref_count[0]= h->pps.ref_count[0];
3977     h->ref_count[1]= h->pps.ref_count[1];
3978
3979     if(h->slice_type_nos != FF_I_TYPE){
3980         if(h->slice_type_nos == FF_B_TYPE){
3981             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3982         }
3983         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3984
3985         if(num_ref_idx_active_override_flag){
3986             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3987             if(h->slice_type_nos==FF_B_TYPE)
3988                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3989
3990             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3991                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3992                 h->ref_count[0]= h->ref_count[1]= 1;
3993                 return -1;
3994             }
3995         }
3996         if(h->slice_type_nos == FF_B_TYPE)
3997             h->list_count= 2;
3998         else
3999             h->list_count= 1;
4000     }else
4001         h->list_count= 0;
4002
4003     if(!default_ref_list_done){
4004         fill_default_ref_list(h);
4005     }
4006
4007     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4008         return -1;
4009
4010     if(h->slice_type_nos!=FF_I_TYPE){
4011         s->last_picture_ptr= &h->ref_list[0][0];
4012         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4013     }
4014     if(h->slice_type_nos==FF_B_TYPE){
4015         s->next_picture_ptr= &h->ref_list[1][0];
4016         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4017     }
4018
4019     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4020        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4021         pred_weight_table(h);
4022     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4023         implicit_weight_table(h);
4024     else {
4025         h->use_weight = 0;
4026         for (i = 0; i < 2; i++) {
4027             h->luma_weight_flag[i]   = 0;
4028             h->chroma_weight_flag[i] = 0;
4029         }
4030     }
4031
4032     if(h->nal_ref_idc)
4033         decode_ref_pic_marking(h0, &s->gb);
4034
4035     if(FRAME_MBAFF)
4036         fill_mbaff_ref_list(h);
4037
4038     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4039         direct_dist_scale_factor(h);
4040     direct_ref_list_init(h);
4041
4042     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4043         tmp = get_ue_golomb_31(&s->gb);
4044         if(tmp > 2){
4045             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4046             return -1;
4047         }
4048         h->cabac_init_idc= tmp;
4049     }
4050
4051     h->last_qscale_diff = 0;
4052     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4053     if(tmp>51){
4054         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4055         return -1;
4056     }
4057     s->qscale= tmp;
4058     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4059     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4060     //FIXME qscale / qp ... stuff
4061     if(h->slice_type == FF_SP_TYPE){
4062         get_bits1(&s->gb); /* sp_for_switch_flag */
4063     }
4064     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4065         get_se_golomb(&s->gb); /* slice_qs_delta */
4066     }
4067
4068     h->deblocking_filter = 1;
4069     h->slice_alpha_c0_offset = 0;
4070     h->slice_beta_offset = 0;
4071     if( h->pps.deblocking_filter_parameters_present ) {
4072         tmp= get_ue_golomb_31(&s->gb);
4073         if(tmp > 2){
4074             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4075             return -1;
4076         }
4077         h->deblocking_filter= tmp;
4078         if(h->deblocking_filter < 2)
4079             h->deblocking_filter^= 1; // 1<->0
4080
4081         if( h->deblocking_filter ) {
4082             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4083             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4084         }
4085     }
4086
4087     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4088        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4089        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4090        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4091         h->deblocking_filter= 0;
4092
4093     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4094         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4095             /* Cheat slightly for speed:
4096                Do not bother to deblock across slices. */
4097             h->deblocking_filter = 2;
4098         } else {
4099             h0->max_contexts = 1;
4100             if(!h0->single_decode_warning) {
4101                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4102                 h0->single_decode_warning = 1;
4103             }
4104             if(h != h0)
4105                 return 1; // deblocking switched inside frame
4106         }
4107     }
4108
4109 #if 0 //FMO
4110     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4111         slice_group_change_cycle= get_bits(&s->gb, ?);
4112 #endif
4113
4114     h0->last_slice_type = slice_type;
4115     h->slice_num = ++h0->current_slice;
4116     if(h->slice_num >= MAX_SLICES){
4117         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4118     }
4119
4120     for(j=0; j<2; j++){
4121         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4122         ref2frm[0]=
4123         ref2frm[1]= -1;
4124         for(i=0; i<16; i++)
4125             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4126                           +(h->ref_list[j][i].reference&3);
4127         ref2frm[18+0]=
4128         ref2frm[18+1]= -1;
4129         for(i=16; i<48; i++)
4130             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4131                           +(h->ref_list[j][i].reference&3);
4132     }
4133
4134     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4135     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4136
4137     s->avctx->refs= h->sps.ref_frame_count;
4138
4139     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4140         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4141                h->slice_num,
4142                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4143                first_mb_in_slice,
4144                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4145                pps_id, h->frame_num,
4146                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4147                h->ref_count[0], h->ref_count[1],
4148                s->qscale,
4149                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4150                h->use_weight,
4151                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4152                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4153                );
4154     }
4155
4156     return 0;
4157 }
4158
4159 /**
4160  *
4161  */
4162 static inline int get_level_prefix(GetBitContext *gb){
4163     unsigned int buf;
4164     int log;
4165
4166     OPEN_READER(re, gb);
4167     UPDATE_CACHE(re, gb);
4168     buf=GET_CACHE(re, gb);
4169
4170     log= 32 - av_log2(buf);
4171 #ifdef TRACE
4172     print_bin(buf>>(32-log), log);
4173     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4174 #endif
4175
4176     LAST_SKIP_BITS(re, gb, log);
4177     CLOSE_READER(re, gb);
4178
4179     return log-1;
4180 }
4181
4182 static inline int get_dct8x8_allowed(H264Context *h){
4183     if(h->sps.direct_8x8_inference_flag)
4184         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4185     else
4186         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4187 }
4188
4189 /**
4190  * decodes a residual block.
4191  * @param n block index
4192  * @param scantable scantable
4193  * @param max_coeff number of coefficients in the block
4194  * @return <0 if an error occurred
4195  */
4196 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4197     MpegEncContext * const s = &h->s;
4198     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4199     int level[16];
4200     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4201
4202     //FIXME put trailing_onex into the context
4203
4204     if(n == CHROMA_DC_BLOCK_INDEX){
4205         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4206         total_coeff= coeff_token>>2;
4207     }else{
4208         if(n == LUMA_DC_BLOCK_INDEX){
4209             total_coeff= pred_non_zero_count(h, 0);
4210             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4211             total_coeff= coeff_token>>2;
4212         }else{
4213             total_coeff= pred_non_zero_count(h, n);
4214             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4215             total_coeff= coeff_token>>2;
4216             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4217         }
4218     }
4219
4220     //FIXME set last_non_zero?
4221
4222     if(total_coeff==0)
4223         return 0;
4224     if(total_coeff > (unsigned)max_coeff) {
4225         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4226         return -1;
4227     }
4228
4229     trailing_ones= coeff_token&3;
4230     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4231     assert(total_coeff<=16);
4232
4233     i = show_bits(gb, 3);
4234     skip_bits(gb, trailing_ones);
4235     level[0] = 1-((i&4)>>1);
4236     level[1] = 1-((i&2)   );
4237     level[2] = 1-((i&1)<<1);
4238
4239     if(trailing_ones<total_coeff) {
4240         int mask, prefix;
4241         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4242         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4243         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4244
4245         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4246         if(level_code >= 100){
4247             prefix= level_code - 100;
4248             if(prefix == LEVEL_TAB_BITS)
4249                 prefix += get_level_prefix(gb);
4250
4251             //first coefficient has suffix_length equal to 0 or 1
4252             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4253                 if(suffix_length)
4254                     level_code= (prefix<<1) + get_bits1(gb); //part
4255                 else
4256                     level_code= prefix; //part
4257             }else if(prefix==14){
4258                 if(suffix_length)
4259                     level_code= (prefix<<1) + get_bits1(gb); //part
4260                 else
4261                     level_code= prefix + get_bits(gb, 4); //part
4262             }else{
4263                 level_code= 30 + get_bits(gb, prefix-3); //part
4264                 if(prefix>=16)
4265                     level_code += (1<<(prefix-3))-4096;
4266             }
4267
4268             if(trailing_ones < 3) level_code += 2;
4269
4270             suffix_length = 2;
4271             mask= -(level_code&1);
4272             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4273         }else{
4274             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4275
4276             suffix_length = 1;
4277             if(level_code + 3U > 6U)
4278                 suffix_length++;
4279             level[trailing_ones]= level_code;
4280         }
4281
4282         //remaining coefficients have suffix_length > 0
4283         for(i=trailing_ones+1;i<total_coeff;i++) {
4284             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4285             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4286             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4287
4288             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4289             if(level_code >= 100){
4290                 prefix= level_code - 100;
4291                 if(prefix == LEVEL_TAB_BITS){
4292                     prefix += get_level_prefix(gb);
4293                 }
4294                 if(prefix<15){
4295                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4296                 }else{
4297                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4298                     if(prefix>=16)
4299                         level_code += (1<<(prefix-3))-4096;
4300                 }
4301                 mask= -(level_code&1);
4302                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4303             }
4304             level[i]= level_code;
4305
4306             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4307                 suffix_length++;
4308         }
4309     }
4310
4311     if(total_coeff == max_coeff)
4312         zeros_left=0;
4313     else{
4314         if(n == CHROMA_DC_BLOCK_INDEX)
4315             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4316         else
4317             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4318     }
4319
4320     coeff_num = zeros_left + total_coeff - 1;
4321     j = scantable[coeff_num];
4322     if(n > 24){
4323         block[j] = level[0];
4324         for(i=1;i<total_coeff;i++) {
4325             if(zeros_left <= 0)
4326                 run_before = 0;
4327             else if(zeros_left < 7){
4328                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4329             }else{
4330                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4331             }
4332             zeros_left -= run_before;
4333             coeff_num -= 1 + run_before;
4334             j= scantable[ coeff_num ];
4335
4336             block[j]= level[i];
4337         }
4338     }else{
4339         block[j] = (level[0] * qmul[j] + 32)>>6;
4340         for(i=1;i<total_coeff;i++) {
4341             if(zeros_left <= 0)
4342                 run_before = 0;
4343             else if(zeros_left < 7){
4344                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4345             }else{
4346                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4347             }
4348             zeros_left -= run_before;
4349             coeff_num -= 1 + run_before;
4350             j= scantable[ coeff_num ];
4351
4352             block[j]= (level[i] * qmul[j] + 32)>>6;
4353         }
4354     }
4355
4356     if(zeros_left<0){
4357         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4358         return -1;
4359     }
4360
4361     return 0;
4362 }
4363
4364 static void predict_field_decoding_flag(H264Context *h){
4365     MpegEncContext * const s = &h->s;
4366     const int mb_xy= h->mb_xy;
4367     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4368                 ? s->current_picture.mb_type[mb_xy-1]
4369                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4370                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4371                 : 0;
4372     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4373 }
4374
4375 /**
4376  * decodes a P_SKIP or B_SKIP macroblock
4377  */
4378 static void decode_mb_skip(H264Context *h){
4379     MpegEncContext * const s = &h->s;
4380     const int mb_xy= h->mb_xy;
4381     int mb_type=0;
4382
4383     memset(h->non_zero_count[mb_xy], 0, 16);
4384     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4385
4386     if(MB_FIELD)
4387         mb_type|= MB_TYPE_INTERLACED;
4388
4389     if( h->slice_type_nos == FF_B_TYPE )
4390     {
4391         // just for fill_caches. pred_direct_motion will set the real mb_type
4392         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4393
4394         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4395         pred_direct_motion(h, &mb_type);
4396         mb_type|= MB_TYPE_SKIP;
4397     }
4398     else
4399     {
4400         int mx, my;
4401         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4402
4403         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4404         pred_pskip_motion(h, &mx, &my);
4405         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4406         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4407     }
4408
4409     write_back_motion(h, mb_type);
4410     s->current_picture.mb_type[mb_xy]= mb_type;
4411     s->current_picture.qscale_table[mb_xy]= s->qscale;
4412     h->slice_table[ mb_xy ]= h->slice_num;
4413     h->prev_mb_skipped= 1;
4414 }
4415
4416 /**
4417  * decodes a macroblock
      *
      * Parses one macroblock at (s->mb_x, s->mb_y) from the bitstream: skip run,
      * mb_type, prediction data (intra prediction modes, or reference indices
      * and motion vector differences), coded block pattern, dquant and the
      * residual blocks (through decode_residual()).
      *
      * NOTE(review): for MB_MBAFF macroblocks h->ref_count[] is doubled before
      * the prediction data is parsed and is only halved again on the successful
      * exit path; the early "return -1" paths leave it doubled -- confirm that
      * callers reinitialise ref_count after an error.
      *
4418  * @returns 0 if OK, -1 if an error is noticed
4419  */
4420 static int decode_mb_cavlc(H264Context *h){
4421     MpegEncContext * const s = &h->s;
4422     int mb_xy;
4423     int partition_count;
4424     unsigned int mb_type, cbp;
4425     int dct8x8_allowed= h->pps.transform_8x8_mode;
4426
4427     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4428
4429     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4430     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4431                 down the code */
4432     if(h->slice_type_nos != FF_I_TYPE){
             // mb_skip_run == -1 is treated as "no run pending": read the next run length
4433         if(s->mb_skip_run==-1)
4434             s->mb_skip_run= get_ue_golomb(&s->gb);
4435
             // a non-zero run means this macroblock is skipped; when the run is
             // already 0 the post-decrement leaves -1, so a new run is read for
             // the following macroblock
4436         if (s->mb_skip_run--) {
4437             if(FRAME_MBAFF && (s->mb_y&1) == 0){
                     // last macroblock of the run: the flag is read from the
                     // bitstream, otherwise it is inferred from the neighbours
4438                 if(s->mb_skip_run==0)
4439                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4440                 else
4441                     predict_field_decoding_flag(h);
4442             }
4443             decode_mb_skip(h);
4444             return 0;
4445         }
4446     }
4447     if(FRAME_MBAFF){
             // the field decoding flag is only read on even macroblock rows
4448         if( (s->mb_y&1) == 0 )
4449             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4450     }
4451
4452     h->prev_mb_skipped= 0;
4453
         // map the coded mb_type through the per-slice-type tables; values past the
         // inter range (>= 23 in B, >= 5 in P slices) are rebased and handled as intra
4454     mb_type= get_ue_golomb(&s->gb);
4455     if(h->slice_type_nos == FF_B_TYPE){
4456         if(mb_type < 23){
4457             partition_count= b_mb_type_info[mb_type].partition_count;
4458             mb_type=         b_mb_type_info[mb_type].type;
4459         }else{
4460             mb_type -= 23;
4461             goto decode_intra_mb;
4462         }
4463     }else if(h->slice_type_nos == FF_P_TYPE){
4464         if(mb_type < 5){
4465             partition_count= p_mb_type_info[mb_type].partition_count;
4466             mb_type=         p_mb_type_info[mb_type].type;
4467         }else{
4468             mb_type -= 5;
4469             goto decode_intra_mb;
4470         }
4471     }else{
4472        assert(h->slice_type_nos == FF_I_TYPE);
4473         if(h->slice_type == FF_SI_TYPE && mb_type)
4474             mb_type--;
4475 decode_intra_mb:
4476         if(mb_type > 25){
4477             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4478             return -1;
4479         }
4480         partition_count=0;
             // for intra types the table also supplies cbp and the 16x16 prediction mode
4481         cbp= i_mb_type_info[mb_type].cbp;
4482         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4483         mb_type= i_mb_type_info[mb_type].type;
4484     }
4485
4486     if(MB_FIELD)
4487         mb_type |= MB_TYPE_INTERLACED;
4488
4489     h->slice_table[ mb_xy ]= h->slice_num;
4490
4491     if(IS_INTRA_PCM(mb_type)){
4492         unsigned int x;
4493
4494         // We assume these blocks are very rare so we do not optimize it.
4495         align_get_bits(&s->gb);
4496
4497         // The pixels are stored in the same order as levels in h->mb array.
4498         for(x=0; x < (CHROMA ? 384 : 256); x++){
4499             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4500         }
4501
4502         // In deblocking, the quantizer is 0
4503         s->current_picture.qscale_table[mb_xy]= 0;
4504         // All coeffs are present
4505         memset(h->non_zero_count[mb_xy], 16, 16);
4506
4507         s->current_picture.mb_type[mb_xy]= mb_type;
4508         return 0;
4509     }
4510
         // doubled here, halved again at the end of the function (success path only)
4511     if(MB_MBAFF){
4512         h->ref_count[0] <<= 1;
4513         h->ref_count[1] <<= 1;
4514     }
4515
4516     fill_caches(h, mb_type, 0);
4517
4518     //mb_pred
4519     if(IS_INTRA(mb_type)){
4520         int pred_mode;
4521 //            init_top_left_availability(h);
4522         if(IS_INTRA4x4(mb_type)){
4523             int i;
4524             int di = 1;
                 // with the 8x8 transform one mode is coded per 8x8 block (step 4)
                 // and replicated over its 2x2 cache entries below
4525             if(dct8x8_allowed && get_bits1(&s->gb)){
4526                 mb_type |= MB_TYPE_8x8DCT;
4527                 di = 4;
4528             }
4529
4530 //                fill_intra4x4_pred_table(h);
4531             for(i=0; i<16; i+=di){
4532                 int mode= pred_intra_mode(h, i);
4533
                     // flag bit 1 keeps the predicted mode; otherwise a 3 bit
                     // remainder is read which skips over the predicted value
4534                 if(!get_bits1(&s->gb)){
4535                     const int rem_mode= get_bits(&s->gb, 3);
4536                     mode = rem_mode + (rem_mode >= mode);
4537                 }
4538
4539                 if(di==4)
4540                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4541                 else
4542                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4543             }
4544             write_back_intra_pred_mode(h);
4545             if( check_intra4x4_pred_mode(h) < 0)
4546                 return -1;
4547         }else{
4548             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4549             if(h->intra16x16_pred_mode < 0)
4550                 return -1;
4551         }
4552         if(CHROMA){
4553             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4554             if(pred_mode < 0)
4555                 return -1;
4556             h->chroma_pred_mode= pred_mode;
4557         }
4558     }else if(partition_count==4){
             // four 8x8 partitions: first all sub_mb_types, then all reference
             // indices, then all motion vector differences are parsed
4559         int i, j, sub_partition_count[4], list, ref[2][4];
4560
4561         if(h->slice_type_nos == FF_B_TYPE){
4562             for(i=0; i<4; i++){
4563                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4564                 if(h->sub_mb_type[i] >=13){
4565                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4566                     return -1;
4567                 }
4568                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4569                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4570             }
4571             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4572                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4573                 pred_direct_motion(h, &mb_type);
4574                 h->ref_cache[0][scan8[4]] =
4575                 h->ref_cache[1][scan8[4]] =
4576                 h->ref_cache[0][scan8[12]] =
4577                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4578             }
4579         }else{
4580             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4581             for(i=0; i<4; i++){
4582                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4583                 if(h->sub_mb_type[i] >=4){
4584                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4585                     return -1;
4586                 }
4587                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4588                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4589             }
4590         }
4591
             // reference indices: nothing is read with a single reference, one
             // inverted bit with two references, a ue(v) code (range checked) otherwise
4592         for(list=0; list<h->list_count; list++){
4593             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4594             for(i=0; i<4; i++){
4595                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4596                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4597                     unsigned int tmp;
4598                     if(ref_count == 1){
4599                         tmp= 0;
4600                     }else if(ref_count == 2){
4601                         tmp= get_bits1(&s->gb)^1;
4602                     }else{
4603                         tmp= get_ue_golomb_31(&s->gb);
4604                         if(tmp>=ref_count){
4605                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4606                             return -1;
4607                         }
4608                     }
4609                     ref[list][i]= tmp;
4610                 }else{
4611                  //FIXME
4612                     ref[list][i] = -1;
4613                 }
4614             }
4615         }
4616
4617         if(dct8x8_allowed)
4618             dct8x8_allowed = get_dct8x8_allowed(h);
4619
             // motion vectors: predict, add the coded difference and store the
             // result in every mv_cache entry covered by the sub-partition
4620         for(list=0; list<h->list_count; list++){
4621             for(i=0; i<4; i++){
4622                 if(IS_DIRECT(h->sub_mb_type[i])) {
4623                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4624                     continue;
4625                 }
4626                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4627                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4628
4629                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4630                     const int sub_mb_type= h->sub_mb_type[i];
4631                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4632                     for(j=0; j<sub_partition_count[i]; j++){
4633                         int mx, my;
4634                         const int index= 4*i + block_width*j;
4635                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4636                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4637                         mx += get_se_golomb(&s->gb);
4638                         my += get_se_golomb(&s->gb);
4639                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4640
4641                         if(IS_SUB_8X8(sub_mb_type)){
4642                             mv_cache[ 1 ][0]=
4643                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4644                             mv_cache[ 1 ][1]=
4645                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4646                         }else if(IS_SUB_8X4(sub_mb_type)){
4647                             mv_cache[ 1 ][0]= mx;
4648                             mv_cache[ 1 ][1]= my;
4649                         }else if(IS_SUB_4X8(sub_mb_type)){
4650                             mv_cache[ 8 ][0]= mx;
4651                             mv_cache[ 8 ][1]= my;
4652                         }
4653                         mv_cache[ 0 ][0]= mx;
4654                         mv_cache[ 0 ][1]= my;
4655                     }
4656                 }else{
                         // list not used by this sub-macroblock: zero its four vectors
4657                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4658                     p[0] = p[1]=
4659                     p[8] = p[9]= 0;
4660                 }
4661             }
4662         }
4663     }else if(IS_DIRECT(mb_type)){
4664         pred_direct_motion(h, &mb_type);
4665         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4666     }else{
             // 16x16, 16x8 and 8x16 partitions: for each shape the reference
             // indices of all lists are read first, then the motion vectors
4667         int list, mx, my, i;
4668          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4669         if(IS_16X16(mb_type)){
4670             for(list=0; list<h->list_count; list++){
4671                     unsigned int val;
4672                     if(IS_DIR(mb_type, 0, list)){
4673                         if(h->ref_count[list]==1){
4674                             val= 0;
4675                         }else if(h->ref_count[list]==2){
4676                             val= get_bits1(&s->gb)^1;
4677                         }else{
4678                             val= get_ue_golomb_31(&s->gb);
4679                             if(val >= h->ref_count[list]){
4680                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4681                                 return -1;
4682                             }
4683                         }
4684                     }else
4685                         val= LIST_NOT_USED&0xFF;
4686                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4687             }
4688             for(list=0; list<h->list_count; list++){
4689                 unsigned int val;
4690                 if(IS_DIR(mb_type, 0, list)){
4691                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4692                     mx += get_se_golomb(&s->gb);
4693                     my += get_se_golomb(&s->gb);
4694                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4695
4696                     val= pack16to32(mx,my);
4697                 }else
4698                     val=0;
4699                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4700             }
4701         }
4702         else if(IS_16X8(mb_type)){
4703             for(list=0; list<h->list_count; list++){
4704                     for(i=0; i<2; i++){
4705                         unsigned int val;
4706                         if(IS_DIR(mb_type, i, list)){
4707                             if(h->ref_count[list] == 1){
4708                                 val= 0;
4709                             }else if(h->ref_count[list] == 2){
4710                                 val= get_bits1(&s->gb)^1;
4711                             }else{
4712                                 val= get_ue_golomb_31(&s->gb);
4713                                 if(val >= h->ref_count[list]){
4714                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4715                                     return -1;
4716                                 }
4717                             }
4718                         }else
4719                             val= LIST_NOT_USED&0xFF;
4720                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4721                     }
4722             }
4723             for(list=0; list<h->list_count; list++){
4724                 for(i=0; i<2; i++){
4725                     unsigned int val;
4726                     if(IS_DIR(mb_type, i, list)){
4727                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4728                         mx += get_se_golomb(&s->gb);
4729                         my += get_se_golomb(&s->gb);
4730                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4731
4732                         val= pack16to32(mx,my);
4733                     }else
4734                         val=0;
4735                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4736                 }
4737             }
4738         }else{
4739             assert(IS_8X16(mb_type));
4740             for(list=0; list<h->list_count; list++){
4741                     for(i=0; i<2; i++){
4742                         unsigned int val;
4743                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4744                             if(h->ref_count[list]==1){
4745                                 val= 0;
4746                             }else if(h->ref_count[list]==2){
4747                                 val= get_bits1(&s->gb)^1;
4748                             }else{
4749                                 val= get_ue_golomb_31(&s->gb);
4750                                 if(val >= h->ref_count[list]){
4751                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4752                                     return -1;
4753                                 }
4754                             }
4755                         }else
4756                             val= LIST_NOT_USED&0xFF;
4757                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4758                     }
4759             }
4760             for(list=0; list<h->list_count; list++){
4761                 for(i=0; i<2; i++){
4762                     unsigned int val;
4763                     if(IS_DIR(mb_type, i, list)){
4764                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4765                         mx += get_se_golomb(&s->gb);
4766                         my += get_se_golomb(&s->gb);
4767                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4768
4769                         val= pack16to32(mx,my);
4770                     }else
4771                         val=0;
4772                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4773                 }
4774             }
4775         }
4776     }
4777
4778     if(IS_INTER(mb_type))
4779         write_back_motion(h, mb_type);
4780
         // Intra16x16 already got its cbp from i_mb_type_info; every other type
         // reads a code (0..47) and maps it through the matching table
4781     if(!IS_INTRA16x16(mb_type)){
4782         cbp= get_ue_golomb(&s->gb);
4783         if(cbp > 47){
4784             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4785             return -1;
4786         }
4787
4788         if(CHROMA){
4789             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4790             else                     cbp= golomb_to_inter_cbp   [cbp];
4791         }else{
4792             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4793             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4794         }
4795     }
4796     h->cbp = cbp;
4797
         // for non-intra macroblocks with coded luma the 8x8 transform flag is read here
4798     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4799         if(get_bits1(&s->gb)){
4800             mb_type |= MB_TYPE_8x8DCT;
4801             h->cbp_table[mb_xy]= cbp;
4802         }
4803     }
4804     s->current_picture.mb_type[mb_xy]= mb_type;
4805
4806     if(cbp || IS_INTRA16x16(mb_type)){
4807         int i8x8, i4x4, chroma_idx;
4808         int dquant;
4809         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4810         const uint8_t *scan, *scan8x8, *dc_scan;
4811
4812 //        fill_non_zero_count_cache(h);
4813
             // scan tables depend on field/frame coding; the *_q0 variants are
             // selected when s->qscale is 0
4814         if(IS_INTERLACED(mb_type)){
4815             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4816             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4817             dc_scan= luma_dc_field_scan;
4818         }else{
4819             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4820             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4821             dc_scan= luma_dc_zigzag_scan;
4822         }
4823
4824         dquant= get_se_golomb(&s->gb);
4825
4826         if( dquant > 25 || dquant < -26 ){
4827             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4828             return -1;
4829         }
4830
             // apply the delta and wrap qscale back into 0..51
4831         s->qscale += dquant;
4832         if(((unsigned)s->qscale) > 51){
4833             if(s->qscale<0) s->qscale+= 52;
4834             else            s->qscale-= 52;
4835         }
4836
4837         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4838         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4839         if(IS_INTRA16x16(mb_type)){
                 // luma DC block first, then (if coded) 16 AC blocks of 15 coefficients
4840             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4841                 return -1; //FIXME continue if partitioned and other return -1 too
4842             }
4843
4844             assert((cbp&15) == 0 || (cbp&15) == 15);
4845
4846             if(cbp&15){
4847                 for(i8x8=0; i8x8<4; i8x8++){
4848                     for(i4x4=0; i4x4<4; i4x4++){
4849                         const int index= i4x4 + 4*i8x8;
4850                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4851                             return -1;
4852                         }
4853                     }
4854                 }
4855             }else{
4856                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4857             }
4858         }else{
                 // one cbp bit per 8x8 luma block; uncoded blocks get zero counts
4859             for(i8x8=0; i8x8<4; i8x8++){
4860                 if(cbp & (1<<i8x8)){
4861                     if(IS_8x8DCT(mb_type)){
4862                         DCTELEM *buf = &h->mb[64*i8x8];
4863                         uint8_t *nnz;
4864                         for(i4x4=0; i4x4<4; i4x4++){
4865                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4866                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4867                                 return -1;
4868                         }
                             // fold the four 4x4 counts into the first cache entry of the 8x8 block
4869                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4870                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4871                     }else{
4872                         for(i4x4=0; i4x4<4; i4x4++){
4873                             const int index= i4x4 + 4*i8x8;
4874
4875                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4876                                 return -1;
4877                             }
4878                         }
4879                     }
4880                 }else{
4881                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4882                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4883                 }
4884             }
4885         }
4886
             // chroma DC (cbp bits 0x30), decoded with a NULL dequant table
4887         if(cbp&0x30){
4888             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4889                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4890                     return -1;
4891                 }
4892         }
4893
             // chroma AC (cbp bit 0x20): four blocks of 15 coefficients per plane
4894         if(cbp&0x20){
4895             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4896                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4897                 for(i4x4=0; i4x4<4; i4x4++){
4898                     const int index= 16 + 4*chroma_idx + i4x4;
4899                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4900                         return -1;
4901                     }
4902                 }
4903             }
4904         }else{
4905             uint8_t * const nnz= &h->non_zero_count_cache[0];
4906             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4907             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4908         }
4909     }else{
             // no residual at all: clear the luma and chroma non-zero counts
4910         uint8_t * const nnz= &h->non_zero_count_cache[0];
4911         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4912         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4913         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4914     }
4915     s->current_picture.qscale_table[mb_xy]= s->qscale;
4916     write_back_non_zero_count(h);
4917
         // undo the MB_MBAFF doubling of ref_count done before fill_caches()
4918     if(MB_MBAFF){
4919         h->ref_count[0] >>= 1;
4920         h->ref_count[1] >>= 1;
4921     }
4922
4923     return 0;
4924 }
4925
4926 static int decode_cabac_field_decoding_flag(H264Context *h) {
4927     MpegEncContext * const s = &h->s;
4928     const int mb_x = s->mb_x;
4929     const int mb_y = s->mb_y & ~1;
4930     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4931     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4932
4933     unsigned int ctx = 0;
4934
4935     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4936         ctx += 1;
4937     }
4938     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4939         ctx += 1;
4940     }
4941
4942     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4943 }
4944
4945 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4946     uint8_t *state= &h->cabac_state[ctx_base];
4947     int mb_type;
4948
4949     if(intra_slice){
4950         MpegEncContext * const s = &h->s;
4951         const int mba_xy = h->left_mb_xy[0];
4952         const int mbb_xy = h->top_mb_xy;
4953         int ctx=0;
4954         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4955             ctx++;
4956         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4957             ctx++;
4958         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4959             return 0;   /* I4x4 */
4960         state += 2;
4961     }else{
4962         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4963             return 0;   /* I4x4 */
4964     }
4965
4966     if( get_cabac_terminate( &h->cabac ) )
4967         return 25;  /* PCM */
4968
4969     mb_type = 1; /* I16x16 */
4970     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4971     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4972         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4973     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4974     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4975     return mb_type;
4976 }
4977
4978 static int decode_cabac_mb_type_b( H264Context *h ) {
4979     MpegEncContext * const s = &h->s;
4980
4981         const int mba_xy = h->left_mb_xy[0];
4982         const int mbb_xy = h->top_mb_xy;
4983         int ctx = 0;
4984         int bits;
4985         assert(h->slice_type_nos == FF_B_TYPE);
4986
4987         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4988             ctx++;
4989         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4990             ctx++;
4991
4992         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4993             return 0; /* B_Direct_16x16 */
4994
4995         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4996             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4997         }
4998
4999         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5000         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5001         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5002         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5003         if( bits < 8 )
5004             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5005         else if( bits == 13 ) {
5006             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5007         } else if( bits == 14 )
5008             return 11; /* B_L1_L0_8x16 */
5009         else if( bits == 15 )
5010             return 22; /* B_8x8 */
5011
5012         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5013         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5014 }
5015
5016 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5017     MpegEncContext * const s = &h->s;
5018     int mba_xy, mbb_xy;
5019     int ctx = 0;
5020
5021     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5022         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5023         mba_xy = mb_xy - 1;
5024         if( (mb_y&1)
5025             && h->slice_table[mba_xy] == h->slice_num
5026             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5027             mba_xy += s->mb_stride;
5028         if( MB_FIELD ){
5029             mbb_xy = mb_xy - s->mb_stride;
5030             if( !(mb_y&1)
5031                 && h->slice_table[mbb_xy] == h->slice_num
5032                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5033                 mbb_xy -= s->mb_stride;
5034         }else
5035             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5036     }else{
5037         int mb_xy = h->mb_xy;
5038         mba_xy = mb_xy - 1;
5039         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5040     }
5041
5042     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5043         ctx++;
5044     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5045         ctx++;
5046
5047     if( h->slice_type_nos == FF_B_TYPE )
5048         ctx += 13;
5049     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5050 }
5051
5052 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5053     int mode = 0;
5054
5055     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5056         return pred_mode;
5057
5058     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5059     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5060     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5061
5062     if( mode >= pred_mode )
5063         return mode + 1;
5064     else
5065         return mode;
5066 }
5067
5068 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5069     const int mba_xy = h->left_mb_xy[0];
5070     const int mbb_xy = h->top_mb_xy;
5071
5072     int ctx = 0;
5073
5074     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5075     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5076         ctx++;
5077
5078     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5079         ctx++;
5080
5081     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5082         return 0;
5083
5084     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5085         return 1;
5086     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5087         return 2;
5088     else
5089         return 3;
5090 }
5091
5092 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5093     int cbp_b, cbp_a, ctx, cbp = 0;
5094
5095     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5096     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5097
5098     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5099     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5100     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5101     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5102     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5103     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5104     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5105     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5106     return cbp;
5107 }
5108 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5109     int ctx;
5110     int cbp_a, cbp_b;
5111
5112     cbp_a = (h->left_cbp>>4)&0x03;
5113     cbp_b = (h-> top_cbp>>4)&0x03;
5114
5115     ctx = 0;
5116     if( cbp_a > 0 ) ctx++;
5117     if( cbp_b > 0 ) ctx += 2;
5118     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5119         return 0;
5120
5121     ctx = 4;
5122     if( cbp_a == 2 ) ctx++;
5123     if( cbp_b == 2 ) ctx += 2;
5124     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5125 }
5126 static int decode_cabac_mb_dqp( H264Context *h) {
5127     int   ctx= h->last_qscale_diff != 0;
5128     int   val = 0;
5129
5130     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5131         ctx= 2+(ctx>>1);
5132         val++;
5133         if(val > 102) //prevent infinite loop
5134             return INT_MIN;
5135     }
5136
5137     if( val&0x01 )
5138         return   (val + 1)>>1 ;
5139     else
5140         return -((val + 1)>>1);
5141 }
5142 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5143     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5144         return 0;   /* 8x8 */
5145     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5146         return 1;   /* 8x4 */
5147     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5148         return 2;   /* 4x8 */
5149     return 3;       /* 4x4 */
5150 }
5151 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5152     int type;
5153     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5154         return 0;   /* B_Direct_8x8 */
5155     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5156         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5157     type = 3;
5158     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5159         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5160             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5161         type += 4;
5162     }
5163     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5164     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5165     return type;
5166 }
5167
5168 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5169     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5170 }
5171
5172 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5173     int refa = h->ref_cache[list][scan8[n] - 1];
5174     int refb = h->ref_cache[list][scan8[n] - 8];
5175     int ref  = 0;
5176     int ctx  = 0;
5177
5178     if( h->slice_type_nos == FF_B_TYPE) {
5179         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5180             ctx++;
5181         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5182             ctx += 2;
5183     } else {
5184         if( refa > 0 )
5185             ctx++;
5186         if( refb > 0 )
5187             ctx += 2;
5188     }
5189
5190     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5191         ref++;
5192         ctx = (ctx>>2)+4;
5193         if(ref >= 32 /*h->ref_list[list]*/){
5194             return -1;
5195         }
5196     }
5197     return ref;
5198 }
5199
5200 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5201     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5202                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5203     int ctxbase = (l == 0) ? 40 : 47;
5204     int mvd;
5205     int ctx = (amvd>2) + (amvd>32);
5206
5207     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5208         return 0;
5209
5210     mvd= 1;
5211     ctx= 3;
5212     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5213         mvd++;
5214         if( ctx < 6 )
5215             ctx++;
5216     }
5217
5218     if( mvd >= 9 ) {
5219         int k = 3;
5220         while( get_cabac_bypass( &h->cabac ) ) {
5221             mvd += 1 << k;
5222             k++;
5223             if(k>24){
5224                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5225                 return INT_MIN;
5226             }
5227         }
5228         while( k-- ) {
5229             if( get_cabac_bypass( &h->cabac ) )
5230                 mvd += 1 << k;
5231         }
5232     }
5233     return get_cabac_bypass_sign( &h->cabac, -mvd );
5234 }
5235
5236 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5237     int nza, nzb;
5238     int ctx = 0;
5239
5240     if( is_dc ) {
5241         if( cat == 0 ) {
5242             nza = h->left_cbp&0x100;
5243             nzb = h-> top_cbp&0x100;
5244         } else {
5245             nza = (h->left_cbp>>(6+idx))&0x01;
5246             nzb = (h-> top_cbp>>(6+idx))&0x01;
5247         }
5248     } else {
5249         assert(cat == 1 || cat == 2 || cat == 4);
5250         nza = h->non_zero_count_cache[scan8[idx] - 1];
5251         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5252     }
5253
5254     if( nza > 0 )
5255         ctx++;
5256
5257     if( nzb > 0 )
5258         ctx += 2;
5259
5260     return ctx + 4 * cat;
5261 }
5262
5263 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5264     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5265     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5266     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5267     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5268 };
5269
5270 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5271     static const int significant_coeff_flag_offset[2][6] = {
5272       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5273       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5274     };
5275     static const int last_coeff_flag_offset[2][6] = {
5276       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5277       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5278     };
5279     static const int coeff_abs_level_m1_offset[6] = {
5280         227+0, 227+10, 227+20, 227+30, 227+39, 426
5281     };
5282     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5283       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5284         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5285         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5286        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5287       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5288         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5289         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5290         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5291     };
5292     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5293      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5294      * map node ctx => cabac ctx for level=1 */
5295     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5296     /* map node ctx => cabac ctx for level>1 */
5297     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5298     static const uint8_t coeff_abs_level_transition[2][8] = {
5299     /* update node ctx after decoding a level=1 */
5300         { 1, 2, 3, 3, 4, 5, 6, 7 },
5301     /* update node ctx after decoding a level>1 */
5302         { 4, 4, 4, 4, 5, 6, 7, 7 }
5303     };
5304
5305     int index[64];
5306
5307     int av_unused last;
5308     int coeff_count = 0;
5309     int node_ctx = 0;
5310
5311     uint8_t *significant_coeff_ctx_base;
5312     uint8_t *last_coeff_ctx_base;
5313     uint8_t *abs_level_m1_ctx_base;
5314
5315 #if !ARCH_X86
5316 #define CABAC_ON_STACK
5317 #endif
5318 #ifdef CABAC_ON_STACK
5319 #define CC &cc
5320     CABACContext cc;
5321     cc.range     = h->cabac.range;
5322     cc.low       = h->cabac.low;
5323     cc.bytestream= h->cabac.bytestream;
5324 #else
5325 #define CC &h->cabac
5326 #endif
5327
5328
5329     /* cat: 0-> DC 16x16  n = 0
5330      *      1-> AC 16x16  n = luma4x4idx
5331      *      2-> Luma4x4   n = luma4x4idx
5332      *      3-> DC Chroma n = iCbCr
5333      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5334      *      5-> Luma8x8   n = 4 * luma8x8idx
5335      */
5336
5337     /* read coded block flag */
5338     if( is_dc || cat != 5 ) {
5339         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5340             if( !is_dc )
5341                 h->non_zero_count_cache[scan8[n]] = 0;
5342
5343 #ifdef CABAC_ON_STACK
5344             h->cabac.range     = cc.range     ;
5345             h->cabac.low       = cc.low       ;
5346             h->cabac.bytestream= cc.bytestream;
5347 #endif
5348             return;
5349         }
5350     }
5351
5352     significant_coeff_ctx_base = h->cabac_state
5353         + significant_coeff_flag_offset[MB_FIELD][cat];
5354     last_coeff_ctx_base = h->cabac_state
5355         + last_coeff_flag_offset[MB_FIELD][cat];
5356     abs_level_m1_ctx_base = h->cabac_state
5357         + coeff_abs_level_m1_offset[cat];
5358
5359     if( !is_dc && cat == 5 ) {
5360 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5361         for(last= 0; last < coefs; last++) { \
5362             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5363             if( get_cabac( CC, sig_ctx )) { \
5364                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5365                 index[coeff_count++] = last; \
5366                 if( get_cabac( CC, last_ctx ) ) { \
5367                     last= max_coeff; \
5368                     break; \
5369                 } \
5370             } \
5371         }\
5372         if( last == max_coeff -1 ) {\
5373             index[coeff_count++] = last;\
5374         }
5375         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5376 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5377         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5378     } else {
5379         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5380 #else
5381         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5382     } else {
5383         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5384 #endif
5385     }
5386     assert(coeff_count > 0);
5387
5388     if( is_dc ) {
5389         if( cat == 0 )
5390             h->cbp_table[h->mb_xy] |= 0x100;
5391         else
5392             h->cbp_table[h->mb_xy] |= 0x40 << n;
5393     } else {
5394         if( cat == 5 )
5395             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5396         else {
5397             assert( cat == 1 || cat == 2 || cat == 4 );
5398             h->non_zero_count_cache[scan8[n]] = coeff_count;
5399         }
5400     }
5401
5402     do {
5403         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5404
5405         int j= scantable[index[--coeff_count]];
5406
5407         if( get_cabac( CC, ctx ) == 0 ) {
5408             node_ctx = coeff_abs_level_transition[0][node_ctx];
5409             if( is_dc ) {
5410                 block[j] = get_cabac_bypass_sign( CC, -1);
5411             }else{
5412                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5413             }
5414         } else {
5415             int coeff_abs = 2;
5416             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5417             node_ctx = coeff_abs_level_transition[1][node_ctx];
5418
5419             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5420                 coeff_abs++;
5421             }
5422
5423             if( coeff_abs >= 15 ) {
5424                 int j = 0;
5425                 while( get_cabac_bypass( CC ) ) {
5426                     j++;
5427                 }
5428
5429                 coeff_abs=1;
5430                 while( j-- ) {
5431                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5432                 }
5433                 coeff_abs+= 14;
5434             }
5435
5436             if( is_dc ) {
5437                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5438             }else{
5439                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5440             }
5441         }
5442     } while( coeff_count );
5443 #ifdef CABAC_ON_STACK
5444             h->cabac.range     = cc.range     ;
5445             h->cabac.low       = cc.low       ;
5446             h->cabac.bytestream= cc.bytestream;
5447 #endif
5448
5449 }
5450
5451 #if !CONFIG_SMALL
5452 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5453     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5454 }
5455
5456 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5457     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5458 }
5459 #endif
5460
5461 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5462 #if CONFIG_SMALL
5463     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5464 #else
5465     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5466     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5467 #endif
5468 }
5469
5470 static inline void compute_mb_neighbors(H264Context *h)
5471 {
5472     MpegEncContext * const s = &h->s;
5473     const int mb_xy  = h->mb_xy;
5474     h->top_mb_xy     = mb_xy - s->mb_stride;
5475     h->left_mb_xy[0] = mb_xy - 1;
5476     if(FRAME_MBAFF){
5477         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5478         const int top_pair_xy      = pair_xy     - s->mb_stride;
5479         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5480         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5481         const int curr_mb_field_flag = MB_FIELD;
5482         const int bottom = (s->mb_y & 1);
5483
5484         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5485             h->top_mb_xy -= s->mb_stride;
5486         }
5487         if (!left_mb_field_flag == curr_mb_field_flag) {
5488             h->left_mb_xy[0] = pair_xy - 1;
5489         }
5490     } else if (FIELD_PICTURE) {
5491         h->top_mb_xy -= s->mb_stride;
5492     }
5493     return;
5494 }
5495
5496 /**
5497  * decodes a macroblock
5498  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5499  */
5500 static int decode_mb_cabac(H264Context *h) {
5501     MpegEncContext * const s = &h->s;
5502     int mb_xy;
5503     int mb_type, partition_count, cbp = 0;
5504     int dct8x8_allowed= h->pps.transform_8x8_mode;
5505
5506     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5507
5508     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5509     if( h->slice_type_nos != FF_I_TYPE ) {
5510         int skip;
5511         /* a skipped mb needs the aff flag from the following mb */
5512         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5513             predict_field_decoding_flag(h);
5514         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5515             skip = h->next_mb_skipped;
5516         else
5517             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5518         /* read skip flags */
5519         if( skip ) {
5520             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5521                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5522                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5523                 if(!h->next_mb_skipped)
5524                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5525             }
5526
5527             decode_mb_skip(h);
5528
5529             h->cbp_table[mb_xy] = 0;
5530             h->chroma_pred_mode_table[mb_xy] = 0;
5531             h->last_qscale_diff = 0;
5532
5533             return 0;
5534
5535         }
5536     }
5537     if(FRAME_MBAFF){
5538         if( (s->mb_y&1) == 0 )
5539             h->mb_mbaff =
5540             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5541     }
5542
5543     h->prev_mb_skipped = 0;
5544
5545     compute_mb_neighbors(h);
5546
5547     if( h->slice_type_nos == FF_B_TYPE ) {
5548         mb_type = decode_cabac_mb_type_b( h );
5549         if( mb_type < 23 ){
5550             partition_count= b_mb_type_info[mb_type].partition_count;
5551             mb_type=         b_mb_type_info[mb_type].type;
5552         }else{
5553             mb_type -= 23;
5554             goto decode_intra_mb;
5555         }
5556     } else if( h->slice_type_nos == FF_P_TYPE ) {
5557         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5558             /* P-type */
5559             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5560                 /* P_L0_D16x16, P_8x8 */
5561                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5562             } else {
5563                 /* P_L0_D8x16, P_L0_D16x8 */
5564                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5565             }
5566             partition_count= p_mb_type_info[mb_type].partition_count;
5567             mb_type=         p_mb_type_info[mb_type].type;
5568         } else {
5569             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5570             goto decode_intra_mb;
5571         }
5572     } else {
5573         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5574         if(h->slice_type == FF_SI_TYPE && mb_type)
5575             mb_type--;
5576         assert(h->slice_type_nos == FF_I_TYPE);
5577 decode_intra_mb:
5578         partition_count = 0;
5579         cbp= i_mb_type_info[mb_type].cbp;
5580         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5581         mb_type= i_mb_type_info[mb_type].type;
5582     }
5583     if(MB_FIELD)
5584         mb_type |= MB_TYPE_INTERLACED;
5585
5586     h->slice_table[ mb_xy ]= h->slice_num;
5587
5588     if(IS_INTRA_PCM(mb_type)) {
5589         const uint8_t *ptr;
5590
5591         // We assume these blocks are very rare so we do not optimize it.
5592         // FIXME The two following lines get the bitstream position in the cabac
5593         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5594         ptr= h->cabac.bytestream;
5595         if(h->cabac.low&0x1) ptr--;
5596         if(CABAC_BITS==16){
5597             if(h->cabac.low&0x1FF) ptr--;
5598         }
5599
5600         // The pixels are stored in the same order as levels in h->mb array.
5601         memcpy(h->mb, ptr, 256); ptr+=256;
5602         if(CHROMA){
5603             memcpy(h->mb+128, ptr, 128); ptr+=128;
5604         }
5605
5606         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5607
5608         // All blocks are present
5609         h->cbp_table[mb_xy] = 0x1ef;
5610         h->chroma_pred_mode_table[mb_xy] = 0;
5611         // In deblocking, the quantizer is 0
5612         s->current_picture.qscale_table[mb_xy]= 0;
5613         // All coeffs are present
5614         memset(h->non_zero_count[mb_xy], 16, 16);
5615         s->current_picture.mb_type[mb_xy]= mb_type;
5616         h->last_qscale_diff = 0;
5617         return 0;
5618     }
5619
5620     if(MB_MBAFF){
5621         h->ref_count[0] <<= 1;
5622         h->ref_count[1] <<= 1;
5623     }
5624
5625     fill_caches(h, mb_type, 0);
5626
5627     if( IS_INTRA( mb_type ) ) {
5628         int i, pred_mode;
5629         if( IS_INTRA4x4( mb_type ) ) {
5630             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5631                 mb_type |= MB_TYPE_8x8DCT;
5632                 for( i = 0; i < 16; i+=4 ) {
5633                     int pred = pred_intra_mode( h, i );
5634                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5635                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5636                 }
5637             } else {
5638                 for( i = 0; i < 16; i++ ) {
5639                     int pred = pred_intra_mode( h, i );
5640                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5641
5642                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5643                 }
5644             }
5645             write_back_intra_pred_mode(h);
5646             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5647         } else {
5648             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5649             if( h->intra16x16_pred_mode < 0 ) return -1;
5650         }
5651         if(CHROMA){
5652             h->chroma_pred_mode_table[mb_xy] =
5653             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5654
5655             pred_mode= check_intra_pred_mode( h, pred_mode );
5656             if( pred_mode < 0 ) return -1;
5657             h->chroma_pred_mode= pred_mode;
5658         }
5659     } else if( partition_count == 4 ) {
5660         int i, j, sub_partition_count[4], list, ref[2][4];
5661
5662         if( h->slice_type_nos == FF_B_TYPE ) {
5663             for( i = 0; i < 4; i++ ) {
5664                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5665                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5666                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5667             }
5668             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5669                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5670                 pred_direct_motion(h, &mb_type);
5671                 h->ref_cache[0][scan8[4]] =
5672                 h->ref_cache[1][scan8[4]] =
5673                 h->ref_cache[0][scan8[12]] =
5674                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5675                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5676                     for( i = 0; i < 4; i++ )
5677                         if( IS_DIRECT(h->sub_mb_type[i]) )
5678                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5679                 }
5680             }
5681         } else {
5682             for( i = 0; i < 4; i++ ) {
5683                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5684                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5685                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5686             }
5687         }
5688
5689         for( list = 0; list < h->list_count; list++ ) {
5690                 for( i = 0; i < 4; i++ ) {
5691                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5692                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5693                         if( h->ref_count[list] > 1 ){
5694                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5695                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5696                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5697                                 return -1;
5698                             }
5699                         }else
5700                             ref[list][i] = 0;
5701                     } else {
5702                         ref[list][i] = -1;
5703                     }
5704                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5705                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5706                 }
5707         }
5708
5709         if(dct8x8_allowed)
5710             dct8x8_allowed = get_dct8x8_allowed(h);
5711
5712         for(list=0; list<h->list_count; list++){
5713             for(i=0; i<4; i++){
5714                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5715                 if(IS_DIRECT(h->sub_mb_type[i])){
5716                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5717                     continue;
5718                 }
5719
5720                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5721                     const int sub_mb_type= h->sub_mb_type[i];
5722                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5723                     for(j=0; j<sub_partition_count[i]; j++){
5724                         int mpx, mpy;
5725                         int mx, my;
5726                         const int index= 4*i + block_width*j;
5727                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5728                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5729                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5730
5731                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5732                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5733                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5734
5735                         if(IS_SUB_8X8(sub_mb_type)){
5736                             mv_cache[ 1 ][0]=
5737                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5738                             mv_cache[ 1 ][1]=
5739                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5740
5741                             mvd_cache[ 1 ][0]=
5742                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5743                             mvd_cache[ 1 ][1]=
5744                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5745                         }else if(IS_SUB_8X4(sub_mb_type)){
5746                             mv_cache[ 1 ][0]= mx;
5747                             mv_cache[ 1 ][1]= my;
5748
5749                             mvd_cache[ 1 ][0]= mx - mpx;
5750                             mvd_cache[ 1 ][1]= my - mpy;
5751                         }else if(IS_SUB_4X8(sub_mb_type)){
5752                             mv_cache[ 8 ][0]= mx;
5753                             mv_cache[ 8 ][1]= my;
5754
5755                             mvd_cache[ 8 ][0]= mx - mpx;
5756                             mvd_cache[ 8 ][1]= my - mpy;
5757                         }
5758                         mv_cache[ 0 ][0]= mx;
5759                         mv_cache[ 0 ][1]= my;
5760
5761                         mvd_cache[ 0 ][0]= mx - mpx;
5762                         mvd_cache[ 0 ][1]= my - mpy;
5763                     }
5764                 }else{
5765                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5766                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5767                     p[0] = p[1] = p[8] = p[9] = 0;
5768                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5769                 }
5770             }
5771         }
5772     } else if( IS_DIRECT(mb_type) ) {
5773         pred_direct_motion(h, &mb_type);
5774         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5775         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5776         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5777     } else {
5778         int list, mx, my, i, mpx, mpy;
5779         if(IS_16X16(mb_type)){
5780             for(list=0; list<h->list_count; list++){
5781                 if(IS_DIR(mb_type, 0, list)){
5782                     int ref;
5783                     if(h->ref_count[list] > 1){
5784                         ref= decode_cabac_mb_ref(h, list, 0);
5785                         if(ref >= (unsigned)h->ref_count[list]){
5786                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5787                             return -1;
5788                         }
5789                     }else
5790                         ref=0;
5791                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5792                 }else
5793                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5794             }
5795             for(list=0; list<h->list_count; list++){
5796                 if(IS_DIR(mb_type, 0, list)){
5797                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5798
5799                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5800                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5801                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5802
5803                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5804                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5805                 }else
5806                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5807             }
5808         }
5809         else if(IS_16X8(mb_type)){
5810             for(list=0; list<h->list_count; list++){
5811                     for(i=0; i<2; i++){
5812                         if(IS_DIR(mb_type, i, list)){
5813                             int ref;
5814                             if(h->ref_count[list] > 1){
5815                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5816                                 if(ref >= (unsigned)h->ref_count[list]){
5817                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5818                                     return -1;
5819                                 }
5820                             }else
5821                                 ref=0;
5822                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5823                         }else
5824                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5825                     }
5826             }
5827             for(list=0; list<h->list_count; list++){
5828                 for(i=0; i<2; i++){
5829                     if(IS_DIR(mb_type, i, list)){
5830                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5831                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5832                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5833                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5834
5835                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5836                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5837                     }else{
5838                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5839                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5840                     }
5841                 }
5842             }
5843         }else{
5844             assert(IS_8X16(mb_type));
5845             for(list=0; list<h->list_count; list++){
5846                     for(i=0; i<2; i++){
5847                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5848                             int ref;
5849                             if(h->ref_count[list] > 1){
5850                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5851                                 if(ref >= (unsigned)h->ref_count[list]){
5852                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5853                                     return -1;
5854                                 }
5855                             }else
5856                                 ref=0;
5857                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5858                         }else
5859                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5860                     }
5861             }
5862             for(list=0; list<h->list_count; list++){
5863                 for(i=0; i<2; i++){
5864                     if(IS_DIR(mb_type, i, list)){
5865                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5866                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5867                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5868
5869                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5870                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5871                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5872                     }else{
5873                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5874                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5875                     }
5876                 }
5877             }
5878         }
5879     }
5880
5881    if( IS_INTER( mb_type ) ) {
5882         h->chroma_pred_mode_table[mb_xy] = 0;
5883         write_back_motion( h, mb_type );
5884    }
5885
5886     if( !IS_INTRA16x16( mb_type ) ) {
5887         cbp  = decode_cabac_mb_cbp_luma( h );
5888         if(CHROMA)
5889             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5890     }
5891
5892     h->cbp_table[mb_xy] = h->cbp = cbp;
5893
5894     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5895         if( decode_cabac_mb_transform_size( h ) )
5896             mb_type |= MB_TYPE_8x8DCT;
5897     }
5898     s->current_picture.mb_type[mb_xy]= mb_type;
5899
5900     if( cbp || IS_INTRA16x16( mb_type ) ) {
5901         const uint8_t *scan, *scan8x8, *dc_scan;
5902         const uint32_t *qmul;
5903         int dqp;
5904
5905         if(IS_INTERLACED(mb_type)){
5906             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5907             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5908             dc_scan= luma_dc_field_scan;
5909         }else{
5910             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5911             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5912             dc_scan= luma_dc_zigzag_scan;
5913         }
5914
5915         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5916         if( dqp == INT_MIN ){
5917             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5918             return -1;
5919         }
5920         s->qscale += dqp;
5921         if(((unsigned)s->qscale) > 51){
5922             if(s->qscale<0) s->qscale+= 52;
5923             else            s->qscale-= 52;
5924         }
5925         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5926         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5927
5928         if( IS_INTRA16x16( mb_type ) ) {
5929             int i;
5930             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5931             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5932
5933             if( cbp&15 ) {
5934                 qmul = h->dequant4_coeff[0][s->qscale];
5935                 for( i = 0; i < 16; i++ ) {
5936                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5937                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5938                 }
5939             } else {
5940                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5941             }
5942         } else {
5943             int i8x8, i4x4;
5944             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5945                 if( cbp & (1<<i8x8) ) {
5946                     if( IS_8x8DCT(mb_type) ) {
5947                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5948                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5949                     } else {
5950                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5951                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5952                             const int index = 4*i8x8 + i4x4;
5953                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5954 //START_TIMER
5955                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5956 //STOP_TIMER("decode_residual")
5957                         }
5958                     }
5959                 } else {
5960                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5961                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5962                 }
5963             }
5964         }
5965
5966         if( cbp&0x30 ){
5967             int c;
5968             for( c = 0; c < 2; c++ ) {
5969                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5970                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5971             }
5972         }
5973
5974         if( cbp&0x20 ) {
5975             int c, i;
5976             for( c = 0; c < 2; c++ ) {
5977                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5978                 for( i = 0; i < 4; i++ ) {
5979                     const int index = 16 + 4 * c + i;
5980                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5981                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5982                 }
5983             }
5984         } else {
5985             uint8_t * const nnz= &h->non_zero_count_cache[0];
5986             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5987             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5988         }
5989     } else {
5990         uint8_t * const nnz= &h->non_zero_count_cache[0];
5991         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5992         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5993         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5994         h->last_qscale_diff = 0;
5995     }
5996
5997     s->current_picture.qscale_table[mb_xy]= s->qscale;
5998     write_back_non_zero_count(h);
5999
6000     if(MB_MBAFF){
6001         h->ref_count[0] >>= 1;
6002         h->ref_count[1] >>= 1;
6003     }
6004
6005     return 0;
6006 }
6007
6008
6009 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6010     const int index_a = qp + h->slice_alpha_c0_offset;
6011     const int alpha = (alpha_table+52)[index_a];
6012     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6013     if (alpha ==0 || beta == 0) return;
6014
6015     if( bS[0] < 4 ) {
6016         int8_t tc[4];
6017         tc[0] = (tc0_table+52)[index_a][bS[0]];
6018         tc[1] = (tc0_table+52)[index_a][bS[1]];
6019         tc[2] = (tc0_table+52)[index_a][bS[2]];
6020         tc[3] = (tc0_table+52)[index_a][bS[3]];
6021         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6022     } else {
6023         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6024     }
6025 }
6026 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6027     const int index_a = qp + h->slice_alpha_c0_offset;
6028     const int alpha = (alpha_table+52)[index_a];
6029     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6030     if (alpha ==0 || beta == 0) return;
6031
6032     if( bS[0] < 4 ) {
6033         int8_t tc[4];
6034         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6035         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6036         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6037         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6038         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6039     } else {
6040         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6041     }
6042 }
6043
6044 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6045     int i;
6046     for( i = 0; i < 16; i++, pix += stride) {
6047         int index_a;
6048         int alpha;
6049         int beta;
6050
6051         int qp_index;
6052         int bS_index = (i >> 1);
6053         if (!MB_FIELD) {
6054             bS_index &= ~1;
6055             bS_index |= (i & 1);
6056         }
6057
6058         if( bS[bS_index] == 0 ) {
6059             continue;
6060         }
6061
6062         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6063         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6064         alpha = (alpha_table+52)[index_a];
6065         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6066
6067         if( bS[bS_index] < 4 ) {
6068             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6069             const int p0 = pix[-1];
6070             const int p1 = pix[-2];
6071             const int p2 = pix[-3];
6072             const int q0 = pix[0];
6073             const int q1 = pix[1];
6074             const int q2 = pix[2];
6075
6076             if( FFABS( p0 - q0 ) < alpha &&
6077                 FFABS( p1 - p0 ) < beta &&
6078                 FFABS( q1 - q0 ) < beta ) {
6079                 int tc = tc0;
6080                 int i_delta;
6081
6082                 if( FFABS( p2 - p0 ) < beta ) {
6083                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6084                     tc++;
6085                 }
6086                 if( FFABS( q2 - q0 ) < beta ) {
6087                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6088                     tc++;
6089                 }
6090
6091                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6092                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6093                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6094                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6095             }
6096         }else{
6097             const int p0 = pix[-1];
6098             const int p1 = pix[-2];
6099             const int p2 = pix[-3];
6100
6101             const int q0 = pix[0];
6102             const int q1 = pix[1];
6103             const int q2 = pix[2];
6104
6105             if( FFABS( p0 - q0 ) < alpha &&
6106                 FFABS( p1 - p0 ) < beta &&
6107                 FFABS( q1 - q0 ) < beta ) {
6108
6109                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6110                     if( FFABS( p2 - p0 ) < beta)
6111                     {
6112                         const int p3 = pix[-4];
6113                         /* p0', p1', p2' */
6114                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6115                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6116                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6117                     } else {
6118                         /* p0' */
6119                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6120                     }
6121                     if( FFABS( q2 - q0 ) < beta)
6122                     {
6123                         const int q3 = pix[3];
6124                         /* q0', q1', q2' */
6125                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6126                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6127                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6128                     } else {
6129                         /* q0' */
6130                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6131                     }
6132                 }else{
6133                     /* p0', q0' */
6134                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6135                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6136                 }
6137                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6138             }
6139         }
6140     }
6141 }
6142 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6143     int i;
6144     for( i = 0; i < 8; i++, pix += stride) {
6145         int index_a;
6146         int alpha;
6147         int beta;
6148
6149         int qp_index;
6150         int bS_index = i;
6151
6152         if( bS[bS_index] == 0 ) {
6153             continue;
6154         }
6155
6156         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6157         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6158         alpha = (alpha_table+52)[index_a];
6159         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6160
6161         if( bS[bS_index] < 4 ) {
6162             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6163             const int p0 = pix[-1];
6164             const int p1 = pix[-2];
6165             const int q0 = pix[0];
6166             const int q1 = pix[1];
6167
6168             if( FFABS( p0 - q0 ) < alpha &&
6169                 FFABS( p1 - p0 ) < beta &&
6170                 FFABS( q1 - q0 ) < beta ) {
6171                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6172
6173                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6174                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6175                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6176             }
6177         }else{
6178             const int p0 = pix[-1];
6179             const int p1 = pix[-2];
6180             const int q0 = pix[0];
6181             const int q1 = pix[1];
6182
6183             if( FFABS( p0 - q0 ) < alpha &&
6184                 FFABS( p1 - p0 ) < beta &&
6185                 FFABS( q1 - q0 ) < beta ) {
6186
6187                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6188                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6189                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6190             }
6191         }
6192     }
6193 }
6194
6195 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6196     const int index_a = qp + h->slice_alpha_c0_offset;
6197     const int alpha = (alpha_table+52)[index_a];
6198     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6199     if (alpha ==0 || beta == 0) return;
6200
6201     if( bS[0] < 4 ) {
6202         int8_t tc[4];
6203         tc[0] = (tc0_table+52)[index_a][bS[0]];
6204         tc[1] = (tc0_table+52)[index_a][bS[1]];
6205         tc[2] = (tc0_table+52)[index_a][bS[2]];
6206         tc[3] = (tc0_table+52)[index_a][bS[3]];
6207         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6208     } else {
6209         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6210     }
6211 }
6212
6213 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6214     const int index_a = qp + h->slice_alpha_c0_offset;
6215     const int alpha = (alpha_table+52)[index_a];
6216     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6217     if (alpha ==0 || beta == 0) return;
6218
6219     if( bS[0] < 4 ) {
6220         int8_t tc[4];
6221         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6222         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6223         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6224         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6225         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6226     } else {
6227         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6228     }
6229 }
6230
6231 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6232     MpegEncContext * const s = &h->s;
6233     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6234     int mb_xy, mb_type;
6235     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6236
6237     mb_xy = h->mb_xy;
6238
6239     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6240         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6241        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6242                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6243         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6244         return;
6245     }
6246     assert(!FRAME_MBAFF);
6247
6248     mb_type = s->current_picture.mb_type[mb_xy];
6249     qp = s->current_picture.qscale_table[mb_xy];
6250     qp0 = s->current_picture.qscale_table[mb_xy-1];
6251     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6252     qpc = get_chroma_qp( h, 0, qp );
6253     qpc0 = get_chroma_qp( h, 0, qp0 );
6254     qpc1 = get_chroma_qp( h, 0, qp1 );
6255     qp0 = (qp + qp0 + 1) >> 1;
6256     qp1 = (qp + qp1 + 1) >> 1;
6257     qpc0 = (qpc + qpc0 + 1) >> 1;
6258     qpc1 = (qpc + qpc1 + 1) >> 1;
6259     qp_thresh = 15 - h->slice_alpha_c0_offset;
6260     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6261        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6262         return;
6263
6264     if( IS_INTRA(mb_type) ) {
6265         int16_t bS4[4] = {4,4,4,4};
6266         int16_t bS3[4] = {3,3,3,3};
6267         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6268         if( IS_8x8DCT(mb_type) ) {
6269             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6270             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6271             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6272             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6273         } else {
6274             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6275             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6276             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6277             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6278             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6279             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6280             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6281             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6282         }
6283         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6284         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6285         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6286         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6287         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6288         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6289         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6290         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6291         return;
6292     } else {
6293         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6294         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6295         int edges;
6296         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6297             edges = 4;
6298             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6299         } else {
6300             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6301                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6302             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6303                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6304                              ? 3 : 0;
6305             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6306             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6307             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6308                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6309         }
6310         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6311             bSv[0][0] = 0x0004000400040004ULL;
6312         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6313             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6314
6315 #define FILTER(hv,dir,edge)\
6316         if(bSv[dir][edge]) {\
6317             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6318             if(!(edge&1)) {\
6319                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6320                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6321             }\
6322         }
6323         if( edges == 1 ) {
6324             FILTER(v,0,0);
6325             FILTER(h,1,0);
6326         } else if( IS_8x8DCT(mb_type) ) {
6327             FILTER(v,0,0);
6328             FILTER(v,0,2);
6329             FILTER(h,1,0);
6330             FILTER(h,1,2);
6331         } else {
6332             FILTER(v,0,0);
6333             FILTER(v,0,1);
6334             FILTER(v,0,2);
6335             FILTER(v,0,3);
6336             FILTER(h,1,0);
6337             FILTER(h,1,1);
6338             FILTER(h,1,2);
6339             FILTER(h,1,3);
6340         }
6341 #undef FILTER
6342     }
6343 }
6344
6345
6346 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6347     MpegEncContext * const s = &h->s;
6348     int edge;
6349     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6350     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6351     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6352     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6353     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6354
6355     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6356                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6357     // how often to recheck mv-based bS when iterating between edges
6358     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6359                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6360     // how often to recheck mv-based bS when iterating along each edge
6361     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6362
6363     if (first_vertical_edge_done) {
6364         start = 1;
6365     }
6366
6367     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6368         start = 1;
6369
6370     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6371         && !IS_INTERLACED(mb_type)
6372         && IS_INTERLACED(mbm_type)
6373         ) {
6374         // This is a special case in the norm where the filtering must
6375         // be done twice (one each of the field) even if we are in a
6376         // frame macroblock.
6377         //
6378         static const int nnz_idx[4] = {4,5,6,3};
6379         unsigned int tmp_linesize   = 2 *   linesize;
6380         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6381         int mbn_xy = mb_xy - 2 * s->mb_stride;
6382         int qp;
6383         int i, j;
6384         int16_t bS[4];
6385
6386         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6387             if( IS_INTRA(mb_type) ||
6388                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6389                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6390             } else {
6391                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6392                 for( i = 0; i < 4; i++ ) {
6393                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6394                         mbn_nnz[nnz_idx[i]] != 0 )
6395                         bS[i] = 2;
6396                     else
6397                         bS[i] = 1;
6398                 }
6399             }
6400             // Do not use s->qscale as luma quantizer because it has not the same
6401             // value in IPCM macroblocks.
6402             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6403             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6404             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6405             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6406             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6407                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6408             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6409                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6410         }
6411
6412         start = 1;
6413     }
6414
6415     /* Calculate bS */
6416     for( edge = start; edge < edges; edge++ ) {
6417         /* mbn_xy: neighbor macroblock */
6418         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6419         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6420         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6421         int16_t bS[4];
6422         int qp;
6423
6424         if( (edge&1) && IS_8x8DCT(mb_type) )
6425             continue;
6426
6427         if( IS_INTRA(mb_type) ||
6428             IS_INTRA(mbn_type) ) {
6429             int value;
6430             if (edge == 0) {
6431                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6432                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6433                 ) {
6434                     value = 4;
6435                 } else {
6436                     value = 3;
6437                 }
6438             } else {
6439                 value = 3;
6440             }
6441             bS[0] = bS[1] = bS[2] = bS[3] = value;
6442         } else {
6443             int i, l;
6444             int mv_done;
6445
6446             if( edge & mask_edge ) {
6447                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6448                 mv_done = 1;
6449             }
6450             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6451                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6452                 mv_done = 1;
6453             }
6454             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6455                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6456                 int bn_idx= b_idx - (dir ? 8:1);
6457                 int v = 0;
6458
6459                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6460                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6461                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6462                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6463                 }
6464
6465                 if(h->slice_type_nos == FF_B_TYPE && v){
6466                     v=0;
6467                     for( l = 0; !v && l < 2; l++ ) {
6468                         int ln= 1-l;
6469                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6470                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6471                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6472                     }
6473                 }
6474
6475                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6476                 mv_done = 1;
6477             }
6478             else
6479                 mv_done = 0;
6480
6481             for( i = 0; i < 4; i++ ) {
6482                 int x = dir == 0 ? edge : i;
6483                 int y = dir == 0 ? i    : edge;
6484                 int b_idx= 8 + 4 + x + 8*y;
6485                 int bn_idx= b_idx - (dir ? 8:1);
6486
6487                 if( h->non_zero_count_cache[b_idx] |
6488                     h->non_zero_count_cache[bn_idx] ) {
6489                     bS[i] = 2;
6490                 }
6491                 else if(!mv_done)
6492                 {
6493                     bS[i] = 0;
6494                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6495                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6496                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6497                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6498                             bS[i] = 1;
6499                             break;
6500                         }
6501                     }
6502
6503                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6504                         bS[i] = 0;
6505                         for( l = 0; l < 2; l++ ) {
6506                             int ln= 1-l;
6507                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6508                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6509                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6510                                 bS[i] = 1;
6511                                 break;
6512                             }
6513                         }
6514                     }
6515                 }
6516             }
6517
6518             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6519                 continue;
6520         }
6521
6522         /* Filter edge */
6523         // Do not use s->qscale as luma quantizer because it has not the same
6524         // value in IPCM macroblocks.
6525         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6526         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6527         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6528         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6529         if( dir == 0 ) {
6530             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6531             if( (edge&1) == 0 ) {
6532                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6533                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6534                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6535                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6536             }
6537         } else {
6538             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6539             if( (edge&1) == 0 ) {
6540                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6541                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6542                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6543                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6544             }
6545         }
6546     }
6547 }
6548
6549 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6550     MpegEncContext * const s = &h->s;
6551     const int mb_xy= mb_x + mb_y*s->mb_stride;
6552     const int mb_type = s->current_picture.mb_type[mb_xy];
6553     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6554     int first_vertical_edge_done = 0;
6555     av_unused int dir;
6556
6557     //for sufficiently low qp, filtering wouldn't do anything
6558     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6559     if(!FRAME_MBAFF){
6560         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6561         int qp = s->current_picture.qscale_table[mb_xy];
6562         if(qp <= qp_thresh
6563            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6564            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6565             return;
6566         }
6567     }
6568
6569     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6570     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6571         int top_type, left_type[2];
6572         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6573         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6574         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6575
6576         if(IS_8x8DCT(top_type)){
6577             h->non_zero_count_cache[4+8*0]=
6578             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6579             h->non_zero_count_cache[6+8*0]=
6580             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6581         }
6582         if(IS_8x8DCT(left_type[0])){
6583             h->non_zero_count_cache[3+8*1]=
6584             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6585         }
6586         if(IS_8x8DCT(left_type[1])){
6587             h->non_zero_count_cache[3+8*3]=
6588             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6589         }
6590
6591         if(IS_8x8DCT(mb_type)){
6592             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6593             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6594
6595             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6596             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6597
6598             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6599             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6600
6601             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6602             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6603         }
6604     }
6605
6606     if (FRAME_MBAFF
6607             // left mb is in picture
6608             && h->slice_table[mb_xy-1] != 0xFFFF
6609             // and current and left pair do not have the same interlaced type
6610             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6611             // and left mb is in the same slice if deblocking_filter == 2
6612             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6613         /* First vertical edge is different in MBAFF frames
6614          * There are 8 different bS to compute and 2 different Qp
6615          */
6616         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6617         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6618         int16_t bS[8];
6619         int qp[2];
6620         int bqp[2];
6621         int rqp[2];
6622         int mb_qp, mbn0_qp, mbn1_qp;
6623         int i;
6624         first_vertical_edge_done = 1;
6625
6626         if( IS_INTRA(mb_type) )
6627             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6628         else {
6629             for( i = 0; i < 8; i++ ) {
6630                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6631
6632                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6633                     bS[i] = 4;
6634                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6635                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6636                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6637                                                                        :
6638                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6639                     bS[i] = 2;
6640                 else
6641                     bS[i] = 1;
6642             }
6643         }
6644
6645         mb_qp = s->current_picture.qscale_table[mb_xy];
6646         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6647         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6648         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6649         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6650                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6651         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6652                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6653         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6654         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6655                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6656         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6657                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6658
6659         /* Filter edge */
6660         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6661         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6662         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6663         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6664         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6665     }
6666
6667 #if CONFIG_SMALL
6668     for( dir = 0; dir < 2; dir++ )
6669         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6670 #else
6671     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6672     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6673 #endif
6674 }
6675
6676 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6677     H264Context *h = *(void**)arg;
6678     MpegEncContext * const s = &h->s;
6679     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6680
6681     s->mb_skip_run= -1;
6682
6683     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6684                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6685
6686     if( h->pps.cabac ) {
6687         int i;
6688
6689         /* realign */
6690         align_get_bits( &s->gb );
6691
6692         /* init cabac */
6693         ff_init_cabac_states( &h->cabac);
6694         ff_init_cabac_decoder( &h->cabac,
6695                                s->gb.buffer + get_bits_count(&s->gb)/8,
6696                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6697         /* calculate pre-state */
6698         for( i= 0; i < 460; i++ ) {
6699             int pre;
6700             if( h->slice_type_nos == FF_I_TYPE )
6701                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6702             else
6703                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6704
6705             if( pre <= 63 )
6706                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6707             else
6708                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6709         }
6710
6711         for(;;){
6712 //START_TIMER
6713             int ret = decode_mb_cabac(h);
6714             int eos;
6715 //STOP_TIMER("decode_mb_cabac")
6716
6717             if(ret>=0) hl_decode_mb(h);
6718
6719             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6720                 s->mb_y++;
6721
6722                 ret = decode_mb_cabac(h);
6723
6724                 if(ret>=0) hl_decode_mb(h);
6725                 s->mb_y--;
6726             }
6727             eos = get_cabac_terminate( &h->cabac );
6728
6729             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6730                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6731                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6732                 return -1;
6733             }
6734
6735             if( ++s->mb_x >= s->mb_width ) {
6736                 s->mb_x = 0;
6737                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6738                 ++s->mb_y;
6739                 if(FIELD_OR_MBAFF_PICTURE) {
6740                     ++s->mb_y;
6741                 }
6742             }
6743
6744             if( eos || s->mb_y >= s->mb_height ) {
6745                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6746                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6747                 return 0;
6748             }
6749         }
6750
6751     } else {
6752         for(;;){
6753             int ret = decode_mb_cavlc(h);
6754
6755             if(ret>=0) hl_decode_mb(h);
6756
6757             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6758                 s->mb_y++;
6759                 ret = decode_mb_cavlc(h);
6760
6761                 if(ret>=0) hl_decode_mb(h);
6762                 s->mb_y--;
6763             }
6764
6765             if(ret<0){
6766                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6767                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6768
6769                 return -1;
6770             }
6771
6772             if(++s->mb_x >= s->mb_width){
6773                 s->mb_x=0;
6774                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6775                 ++s->mb_y;
6776                 if(FIELD_OR_MBAFF_PICTURE) {
6777                     ++s->mb_y;
6778                 }
6779                 if(s->mb_y >= s->mb_height){
6780                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6781
6782                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6783                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6784
6785                         return 0;
6786                     }else{
6787                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6788
6789                         return -1;
6790                     }
6791                 }
6792             }
6793
6794             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6795                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6796                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6797                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6798
6799                     return 0;
6800                 }else{
6801                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6802
6803                     return -1;
6804                 }
6805             }
6806         }
6807     }
6808
6809 #if 0
6810     for(;s->mb_y < s->mb_height; s->mb_y++){
6811         for(;s->mb_x < s->mb_width; s->mb_x++){
6812             int ret= decode_mb(h);
6813
6814             hl_decode_mb(h);
6815
6816             if(ret<0){
6817                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6818                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6819
6820                 return -1;
6821             }
6822
6823             if(++s->mb_x >= s->mb_width){
6824                 s->mb_x=0;
6825                 if(++s->mb_y >= s->mb_height){
6826                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6827                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6828
6829                         return 0;
6830                     }else{
6831                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6832
6833                         return -1;
6834                     }
6835                 }
6836             }
6837
6838             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6839                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6840                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6841
6842                     return 0;
6843                 }else{
6844                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6845
6846                     return -1;
6847                 }
6848             }
6849         }
6850         s->mb_x=0;
6851         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6852     }
6853 #endif
6854     return -1; //not reached
6855 }
6856
6857 static int decode_picture_timing(H264Context *h){
6858     MpegEncContext * const s = &h->s;
6859     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6860         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6861         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6862     }
6863     if(h->sps.pic_struct_present_flag){
6864         unsigned int i, num_clock_ts;
6865         h->sei_pic_struct = get_bits(&s->gb, 4);
6866         h->sei_ct_type    = 0;
6867
6868         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6869             return -1;
6870
6871         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6872
6873         for (i = 0 ; i < num_clock_ts ; i++){
6874             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6875                 unsigned int full_timestamp_flag;
6876                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6877                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6878                 skip_bits(&s->gb, 5);                 /* counting_type */
6879                 full_timestamp_flag = get_bits(&s->gb, 1);
6880                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6881                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6882                 skip_bits(&s->gb, 8);                 /* n_frames */
6883                 if(full_timestamp_flag){
6884                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6885                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6886                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6887                 }else{
6888                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6889                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6890                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6891                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6892                             if(get_bits(&s->gb, 1))   /* hours_flag */
6893                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6894                         }
6895                     }
6896                 }
6897                 if(h->sps.time_offset_length > 0)
6898                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6899             }
6900         }
6901
6902         if(s->avctx->debug & FF_DEBUG_PICT_INFO)
6903             av_log(s->avctx, AV_LOG_DEBUG, "ct_type:%X pic_struct:%d\n", h->sei_ct_type, h->sei_pic_struct);
6904     }
6905     return 0;
6906 }
6907
6908 static int decode_unregistered_user_data(H264Context *h, int size){
6909     MpegEncContext * const s = &h->s;
6910     uint8_t user_data[16+256];
6911     int e, build, i;
6912
6913     if(size<16)
6914         return -1;
6915
6916     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6917         user_data[i]= get_bits(&s->gb, 8);
6918     }
6919
6920     user_data[i]= 0;
6921     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6922     if(e==1 && build>=0)
6923         h->x264_build= build;
6924
6925     if(s->avctx->debug & FF_DEBUG_BUGS)
6926         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6927
6928     for(; i<size; i++)
6929         skip_bits(&s->gb, 8);
6930
6931     return 0;
6932 }
6933
6934 static int decode_recovery_point(H264Context *h){
6935     MpegEncContext * const s = &h->s;
6936
6937     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6938     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6939
6940     return 0;
6941 }
6942
6943 static int decode_buffering_period(H264Context *h){
6944     MpegEncContext * const s = &h->s;
6945     unsigned int sps_id;
6946     int sched_sel_idx;
6947     SPS *sps;
6948
6949     sps_id = get_ue_golomb_31(&s->gb);
6950     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6951         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6952         return -1;
6953     }
6954     sps = h->sps_buffers[sps_id];
6955
6956     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6957     if (sps->nal_hrd_parameters_present_flag) {
6958         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6959             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6960             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6961         }
6962     }
6963     if (sps->vcl_hrd_parameters_present_flag) {
6964         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6965             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6966             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6967         }
6968     }
6969
6970     h->sei_buffering_period_present = 1;
6971     return 0;
6972 }
6973
6974 int ff_h264_decode_sei(H264Context *h){
6975     MpegEncContext * const s = &h->s;
6976
6977     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6978         int size, type;
6979
6980         type=0;
6981         do{
6982             type+= show_bits(&s->gb, 8);
6983         }while(get_bits(&s->gb, 8) == 255);
6984
6985         size=0;
6986         do{
6987             size+= show_bits(&s->gb, 8);
6988         }while(get_bits(&s->gb, 8) == 255);
6989
6990         switch(type){
6991         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6992             if(decode_picture_timing(h) < 0)
6993                 return -1;
6994             break;
6995         case SEI_TYPE_USER_DATA_UNREGISTERED:
6996             if(decode_unregistered_user_data(h, size) < 0)
6997                 return -1;
6998             break;
6999         case SEI_TYPE_RECOVERY_POINT:
7000             if(decode_recovery_point(h) < 0)
7001                 return -1;
7002             break;
7003         case SEI_BUFFERING_PERIOD:
7004             if(decode_buffering_period(h) < 0)
7005                 return -1;
7006             break;
7007         default:
7008             skip_bits(&s->gb, 8*size);
7009         }
7010
7011         //FIXME check bits here
7012         align_get_bits(&s->gb);
7013     }
7014
7015     return 0;
7016 }
7017
7018 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7019     MpegEncContext * const s = &h->s;
7020     int cpb_count, i;
7021     cpb_count = get_ue_golomb_31(&s->gb) + 1;
7022
7023     if(cpb_count > 32U){
7024         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7025         return -1;
7026     }
7027
7028     get_bits(&s->gb, 4); /* bit_rate_scale */
7029     get_bits(&s->gb, 4); /* cpb_size_scale */
7030     for(i=0; i<cpb_count; i++){
7031         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7032         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7033         get_bits1(&s->gb);     /* cbr_flag */
7034     }
7035     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7036     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7037     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7038     sps->time_offset_length = get_bits(&s->gb, 5);
7039     sps->cpb_cnt = cpb_count;
7040     return 0;
7041 }
7042
7043 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7044     MpegEncContext * const s = &h->s;
7045     int aspect_ratio_info_present_flag;
7046     unsigned int aspect_ratio_idc;
7047
7048     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7049
7050     if( aspect_ratio_info_present_flag ) {
7051         aspect_ratio_idc= get_bits(&s->gb, 8);
7052         if( aspect_ratio_idc == EXTENDED_SAR ) {
7053             sps->sar.num= get_bits(&s->gb, 16);
7054             sps->sar.den= get_bits(&s->gb, 16);
7055         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7056             sps->sar=  pixel_aspect[aspect_ratio_idc];
7057         }else{
7058             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7059             return -1;
7060         }
7061     }else{
7062         sps->sar.num=
7063         sps->sar.den= 0;
7064     }
7065 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7066
7067     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7068         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7069     }
7070
7071     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7072         get_bits(&s->gb, 3);    /* video_format */
7073         get_bits1(&s->gb);      /* video_full_range_flag */
7074         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7075             get_bits(&s->gb, 8); /* colour_primaries */
7076             get_bits(&s->gb, 8); /* transfer_characteristics */
7077             get_bits(&s->gb, 8); /* matrix_coefficients */
7078         }
7079     }
7080
7081     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7082         s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1;  /* chroma_sample_location_type_top_field */
7083         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7084     }
7085
7086     sps->timing_info_present_flag = get_bits1(&s->gb);
7087     if(sps->timing_info_present_flag){
7088         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7089         sps->time_scale = get_bits_long(&s->gb, 32);
7090         if(sps->num_units_in_tick-1 > 0x7FFFFFFEU || sps->time_scale-1 > 0x7FFFFFFEU){
7091             av_log(h->s.avctx, AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
7092             return -1;
7093         }
7094         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7095     }
7096
7097     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7098     if(sps->nal_hrd_parameters_present_flag)
7099         if(decode_hrd_parameters(h, sps) < 0)
7100             return -1;
7101     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7102     if(sps->vcl_hrd_parameters_present_flag)
7103         if(decode_hrd_parameters(h, sps) < 0)
7104             return -1;
7105     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7106         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7107     sps->pic_struct_present_flag = get_bits1(&s->gb);
7108
7109     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7110     if(sps->bitstream_restriction_flag){
7111         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7112         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7113         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7114         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7115         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7116         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7117         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7118
7119         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7120             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7121             return -1;
7122         }
7123     }
7124
7125     return 0;
7126 }
7127
7128 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7129                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7130     MpegEncContext * const s = &h->s;
7131     int i, last = 8, next = 8;
7132     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7133     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7134         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7135     else
7136     for(i=0;i<size;i++){
7137         if(next)
7138             next = (last + get_se_golomb(&s->gb)) & 0xff;
7139         if(!i && !next){ /* matrix not written, we use the preset one */
7140             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7141             break;
7142         }
7143         last = factors[scan[i]] = next ? next : last;
7144     }
7145 }
7146
7147 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7148                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7149     MpegEncContext * const s = &h->s;
7150     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7151     const uint8_t *fallback[4] = {
7152         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7153         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7154         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7155         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7156     };
7157     if(get_bits1(&s->gb)){
7158         sps->scaling_matrix_present |= is_sps;
7159         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7160         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7161         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7162         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7163         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7164         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7165         if(is_sps || pps->transform_8x8_mode){
7166             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7167             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7168         }
7169     }
7170 }
7171
7172 int ff_h264_decode_seq_parameter_set(H264Context *h){
7173     MpegEncContext * const s = &h->s;
7174     int profile_idc, level_idc;
7175     unsigned int sps_id;
7176     int i;
7177     SPS *sps;
7178
7179     profile_idc= get_bits(&s->gb, 8);
7180     get_bits1(&s->gb);   //constraint_set0_flag
7181     get_bits1(&s->gb);   //constraint_set1_flag
7182     get_bits1(&s->gb);   //constraint_set2_flag
7183     get_bits1(&s->gb);   //constraint_set3_flag
7184     get_bits(&s->gb, 4); // reserved
7185     level_idc= get_bits(&s->gb, 8);
7186     sps_id= get_ue_golomb_31(&s->gb);
7187
7188     if(sps_id >= MAX_SPS_COUNT) {
7189         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7190         return -1;
7191     }
7192     sps= av_mallocz(sizeof(SPS));
7193     if(sps == NULL)
7194         return -1;
7195
7196     sps->profile_idc= profile_idc;
7197     sps->level_idc= level_idc;
7198
7199     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7200     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7201     sps->scaling_matrix_present = 0;
7202
7203     if(sps->profile_idc >= 100){ //high profile
7204         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7205         if(sps->chroma_format_idc == 3)
7206             sps->residual_color_transform_flag = get_bits1(&s->gb);
7207         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7208         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7209         sps->transform_bypass = get_bits1(&s->gb);
7210         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7211     }else{
7212         sps->chroma_format_idc= 1;
7213     }
7214
7215     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7216     sps->poc_type= get_ue_golomb_31(&s->gb);
7217
7218     if(sps->poc_type == 0){ //FIXME #define
7219         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7220     } else if(sps->poc_type == 1){//FIXME #define
7221         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7222         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7223         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7224         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7225
7226         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7227             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7228             goto fail;
7229         }
7230
7231         for(i=0; i<sps->poc_cycle_length; i++)
7232             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7233     }else if(sps->poc_type != 2){
7234         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7235         goto fail;
7236     }
7237
7238     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7239     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7240         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7241         goto fail;
7242     }
7243     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7244     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7245     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7246     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7247        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7248         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7249         goto fail;
7250     }
7251
7252     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7253     if(!sps->frame_mbs_only_flag)
7254         sps->mb_aff= get_bits1(&s->gb);
7255     else
7256         sps->mb_aff= 0;
7257
7258     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7259
7260 #ifndef ALLOW_INTERLACE
7261     if(sps->mb_aff)
7262         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7263 #endif
7264     sps->crop= get_bits1(&s->gb);
7265     if(sps->crop){
7266         sps->crop_left  = get_ue_golomb(&s->gb);
7267         sps->crop_right = get_ue_golomb(&s->gb);
7268         sps->crop_top   = get_ue_golomb(&s->gb);
7269         sps->crop_bottom= get_ue_golomb(&s->gb);
7270         if(sps->crop_left || sps->crop_top){
7271             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7272         }
7273         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7274             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7275         }
7276     }else{
7277         sps->crop_left  =
7278         sps->crop_right =
7279         sps->crop_top   =
7280         sps->crop_bottom= 0;
7281     }
7282
7283     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7284     if( sps->vui_parameters_present_flag )
7285         if (decode_vui_parameters(h, sps) < 0)
7286             goto fail;
7287
7288     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7289         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7290                sps_id, sps->profile_idc, sps->level_idc,
7291                sps->poc_type,
7292                sps->ref_frame_count,
7293                sps->mb_width, sps->mb_height,
7294                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7295                sps->direct_8x8_inference_flag ? "8B8" : "",
7296                sps->crop_left, sps->crop_right,
7297                sps->crop_top, sps->crop_bottom,
7298                sps->vui_parameters_present_flag ? "VUI" : "",
7299                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7300                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7301                sps->timing_info_present_flag ? sps->time_scale : 0
7302                );
7303     }
7304
7305     av_free(h->sps_buffers[sps_id]);
7306     h->sps_buffers[sps_id]= sps;
7307     h->sps = *sps;
7308     return 0;
7309 fail:
7310     av_free(sps);
7311     return -1;
7312 }
7313
7314 static void
7315 build_qp_table(PPS *pps, int t, int index)
7316 {
7317     int i;
7318     for(i = 0; i < 52; i++)
7319         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7320 }
7321
7322 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7323     MpegEncContext * const s = &h->s;
7324     unsigned int pps_id= get_ue_golomb(&s->gb);
7325     PPS *pps;
7326
7327     if(pps_id >= MAX_PPS_COUNT) {
7328         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7329         return -1;
7330     }
7331
7332     pps= av_mallocz(sizeof(PPS));
7333     if(pps == NULL)
7334         return -1;
7335     pps->sps_id= get_ue_golomb_31(&s->gb);
7336     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7337         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7338         goto fail;
7339     }
7340
7341     pps->cabac= get_bits1(&s->gb);
7342     pps->pic_order_present= get_bits1(&s->gb);
7343     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7344     if(pps->slice_group_count > 1 ){
7345         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7346         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7347         switch(pps->mb_slice_group_map_type){
7348         case 0:
7349 #if 0
7350 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7351 |    run_length[ i ]                                |1  |ue(v)   |
7352 #endif
7353             break;
7354         case 2:
7355 #if 0
7356 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7357 |{                                                  |   |        |
7358 |    top_left_mb[ i ]                               |1  |ue(v)   |
7359 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7360 |   }                                               |   |        |
7361 #endif
7362             break;
7363         case 3:
7364         case 4:
7365         case 5:
7366 #if 0
7367 |   slice_group_change_direction_flag               |1  |u(1)    |
7368 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7369 #endif
7370             break;
7371         case 6:
7372 #if 0
7373 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7374 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7375 |)                                                  |   |        |
7376 |    slice_group_id[ i ]                            |1  |u(v)    |
7377 #endif
7378             break;
7379         }
7380     }
7381     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7382     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7383     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7384         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7385         goto fail;
7386     }
7387
7388     pps->weighted_pred= get_bits1(&s->gb);
7389     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7390     pps->init_qp= get_se_golomb(&s->gb) + 26;
7391     pps->init_qs= get_se_golomb(&s->gb) + 26;
7392     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7393     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7394     pps->constrained_intra_pred= get_bits1(&s->gb);
7395     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7396
7397     pps->transform_8x8_mode= 0;
7398     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7399     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7400     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7401
7402     if(get_bits_count(&s->gb) < bit_length){
7403         pps->transform_8x8_mode= get_bits1(&s->gb);
7404         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7405         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7406     } else {
7407         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7408     }
7409
7410     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7411     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7412     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7413         h->pps.chroma_qp_diff= 1;
7414
7415     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7416         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7417                pps_id, pps->sps_id,
7418                pps->cabac ? "CABAC" : "CAVLC",
7419                pps->slice_group_count,
7420                pps->ref_count[0], pps->ref_count[1],
7421                pps->weighted_pred ? "weighted" : "",
7422                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7423                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7424                pps->constrained_intra_pred ? "CONSTR" : "",
7425                pps->redundant_pic_cnt_present ? "REDU" : "",
7426                pps->transform_8x8_mode ? "8x8DCT" : ""
7427                );
7428     }
7429
7430     av_free(h->pps_buffers[pps_id]);
7431     h->pps_buffers[pps_id]= pps;
7432     return 0;
7433 fail:
7434     av_free(pps);
7435     return -1;
7436 }
7437
7438 /**
7439  * Call decode_slice() for each context.
7440  *
7441  * @param h h264 master context
7442  * @param context_count number of contexts to execute
7443  */
7444 static void execute_decode_slices(H264Context *h, int context_count){
7445     MpegEncContext * const s = &h->s;
7446     AVCodecContext * const avctx= s->avctx;
7447     H264Context *hx;
7448     int i;
7449
7450     if (s->avctx->hwaccel)
7451         return;
7452     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7453         return;
7454     if(context_count == 1) {
7455         decode_slice(avctx, &h);
7456     } else {
7457         for(i = 1; i < context_count; i++) {
7458             hx = h->thread_context[i];
7459             hx->s.error_recognition = avctx->error_recognition;
7460             hx->s.error_count = 0;
7461         }
7462
7463         avctx->execute(avctx, (void *)decode_slice,
7464                        h->thread_context, NULL, context_count, sizeof(void*));
7465
7466         /* pull back stuff from slices to master context */
7467         hx = h->thread_context[context_count - 1];
7468         s->mb_x = hx->s.mb_x;
7469         s->mb_y = hx->s.mb_y;
7470         s->dropable = hx->s.dropable;
7471         s->picture_structure = hx->s.picture_structure;
7472         for(i = 1; i < context_count; i++)
7473             h->s.error_count += h->thread_context[i]->s.error_count;
7474     }
7475 }
7476
7477
7478 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7479     MpegEncContext * const s = &h->s;
7480     AVCodecContext * const avctx= s->avctx;
7481     int buf_index=0;
7482     H264Context *hx; ///< thread context
7483     int context_count = 0;
7484     int next_avc= h->is_avc ? 0 : buf_size;
7485
7486     h->max_contexts = avctx->thread_count;
7487 #if 0
7488     int i;
7489     for(i=0; i<50; i++){
7490         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7491     }
7492 #endif
7493     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7494         h->current_slice = 0;
7495         if (!s->first_field)
7496             s->current_picture_ptr= NULL;
7497         reset_sei(h);
7498     }
7499
7500     for(;;){
7501         int consumed;
7502         int dst_length;
7503         int bit_length;
7504         const uint8_t *ptr;
7505         int i, nalsize = 0;
7506         int err;
7507
7508         if(buf_index >= next_avc) {
7509             if(buf_index >= buf_size) break;
7510             nalsize = 0;
7511             for(i = 0; i < h->nal_length_size; i++)
7512                 nalsize = (nalsize << 8) | buf[buf_index++];
7513             if(nalsize <= 1 || nalsize > buf_size - buf_index){
7514                 if(nalsize == 1){
7515                     buf_index++;
7516                     continue;
7517                 }else{
7518                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7519                     break;
7520                 }
7521             }
7522             next_avc= buf_index + nalsize;
7523         } else {
7524             // start code prefix search
7525             for(; buf_index + 3 < buf_size; buf_index++){
7526                 // This should always succeed in the first iteration.
7527                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7528                     break;
7529             }
7530
7531             if(buf_index+3 >= buf_size) break;
7532
7533             buf_index+=3;
7534         }
7535
7536         hx = h->thread_context[context_count];
7537
7538         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7539         if (ptr==NULL || dst_length < 0){
7540             return -1;
7541         }
7542         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7543             dst_length--;
7544         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7545
7546         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7547             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7548         }
7549
7550         if (h->is_avc && (nalsize != consumed) && nalsize){
7551             int i, debug_level = AV_LOG_DEBUG;
7552             for (i = consumed; i < nalsize; i++)
7553                 if (buf[buf_index+i])
7554                     debug_level = AV_LOG_ERROR;
7555             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7556         }
7557
7558         buf_index += consumed;
7559
7560         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7561            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7562             continue;
7563
7564       again:
7565         err = 0;
7566         switch(hx->nal_unit_type){
7567         case NAL_IDR_SLICE:
7568             if (h->nal_unit_type != NAL_IDR_SLICE) {
7569                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7570                 return -1;
7571             }
7572             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7573         case NAL_SLICE:
7574             init_get_bits(&hx->s.gb, ptr, bit_length);
7575             hx->intra_gb_ptr=
7576             hx->inter_gb_ptr= &hx->s.gb;
7577             hx->s.data_partitioning = 0;
7578
7579             if((err = decode_slice_header(hx, h)))
7580                break;
7581
7582             if (s->avctx->hwaccel && h->current_slice == 1) {
7583                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7584                     return -1;
7585             }
7586
7587             s->current_picture_ptr->key_frame |=
7588                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7589                     (h->sei_recovery_frame_cnt >= 0);
7590             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7591                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7592                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7593                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7594                && avctx->skip_frame < AVDISCARD_ALL){
7595                 if(avctx->hwaccel) {
7596                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7597                         return -1;
7598                 }else
7599                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7600                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7601                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7602                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7603                 }else
7604                     context_count++;
7605             }
7606             break;
7607         case NAL_DPA:
7608             init_get_bits(&hx->s.gb, ptr, bit_length);
7609             hx->intra_gb_ptr=
7610             hx->inter_gb_ptr= NULL;
7611
7612             if ((err = decode_slice_header(hx, h)) < 0)
7613                 break;
7614
7615             hx->s.data_partitioning = 1;
7616
7617             break;
7618         case NAL_DPB:
7619             init_get_bits(&hx->intra_gb, ptr, bit_length);
7620             hx->intra_gb_ptr= &hx->intra_gb;
7621             break;
7622         case NAL_DPC:
7623             init_get_bits(&hx->inter_gb, ptr, bit_length);
7624             hx->inter_gb_ptr= &hx->inter_gb;
7625
7626             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7627                && s->context_initialized
7628                && s->hurry_up < 5
7629                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7630                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7631                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7632                && avctx->skip_frame < AVDISCARD_ALL)
7633                 context_count++;
7634             break;
7635         case NAL_SEI:
7636             init_get_bits(&s->gb, ptr, bit_length);
7637             ff_h264_decode_sei(h);
7638             break;
7639         case NAL_SPS:
7640             init_get_bits(&s->gb, ptr, bit_length);
7641             ff_h264_decode_seq_parameter_set(h);
7642
7643             if(s->flags& CODEC_FLAG_LOW_DELAY)
7644                 s->low_delay=1;
7645
7646             if(avctx->has_b_frames < 2)
7647                 avctx->has_b_frames= !s->low_delay;
7648             break;
7649         case NAL_PPS:
7650             init_get_bits(&s->gb, ptr, bit_length);
7651
7652             ff_h264_decode_picture_parameter_set(h, bit_length);
7653
7654             break;
7655         case NAL_AUD:
7656         case NAL_END_SEQUENCE:
7657         case NAL_END_STREAM:
7658         case NAL_FILLER_DATA:
7659         case NAL_SPS_EXT:
7660         case NAL_AUXILIARY_SLICE:
7661             break;
7662         default:
7663             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7664         }
7665
7666         if(context_count == h->max_contexts) {
7667             execute_decode_slices(h, context_count);
7668             context_count = 0;
7669         }
7670
7671         if (err < 0)
7672             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7673         else if(err == 1) {
7674             /* Slice could not be decoded in parallel mode, copy down
7675              * NAL unit stuff to context 0 and restart. Note that
7676              * rbsp_buffer is not transferred, but since we no longer
7677              * run in parallel mode this should not be an issue. */
7678             h->nal_unit_type = hx->nal_unit_type;
7679             h->nal_ref_idc   = hx->nal_ref_idc;
7680             hx = h;
7681             goto again;
7682         }
7683     }
7684     if(context_count)
7685         execute_decode_slices(h, context_count);
7686     return buf_index;
7687 }
7688
7689 /**
7690  * returns the number of bytes consumed for building the current frame
7691  */
7692 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7693         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7694         if(pos+10>buf_size) pos=buf_size; // oops ;)
7695
7696         return pos;
7697 }
7698
7699 static int decode_frame(AVCodecContext *avctx,
7700                              void *data, int *data_size,
7701                              AVPacket *avpkt)
7702 {
7703     const uint8_t *buf = avpkt->data;
7704     int buf_size = avpkt->size;
7705     H264Context *h = avctx->priv_data;
7706     MpegEncContext *s = &h->s;
7707     AVFrame *pict = data;
7708     int buf_index;
7709
7710     s->flags= avctx->flags;
7711     s->flags2= avctx->flags2;
7712
7713    /* end of stream, output what is still in the buffers */
7714     if (buf_size == 0) {
7715         Picture *out;
7716         int i, out_idx;
7717
7718 //FIXME factorize this with the output code below
7719         out = h->delayed_pic[0];
7720         out_idx = 0;
7721         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7722             if(h->delayed_pic[i]->poc < out->poc){
7723                 out = h->delayed_pic[i];
7724                 out_idx = i;
7725             }
7726
7727         for(i=out_idx; h->delayed_pic[i]; i++)
7728             h->delayed_pic[i] = h->delayed_pic[i+1];
7729
7730         if(out){
7731             *data_size = sizeof(AVFrame);
7732             *pict= *(AVFrame*)out;
7733         }
7734
7735         return 0;
7736     }
7737
7738     if(h->is_avc && !h->got_avcC) {
7739         int i, cnt, nalsize;
7740         unsigned char *p = avctx->extradata;
7741         if(avctx->extradata_size < 7) {
7742             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7743             return -1;
7744         }
7745         if(*p != 1) {
7746             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7747             return -1;
7748         }
7749         /* sps and pps in the avcC always have length coded with 2 bytes,
7750            so put a fake nal_length_size = 2 while parsing them */
7751         h->nal_length_size = 2;
7752         // Decode sps from avcC
7753         cnt = *(p+5) & 0x1f; // Number of sps
7754         p += 6;
7755         for (i = 0; i < cnt; i++) {
7756             nalsize = AV_RB16(p) + 2;
7757             if(decode_nal_units(h, p, nalsize) < 0) {
7758                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7759                 return -1;
7760             }
7761             p += nalsize;
7762         }
7763         // Decode pps from avcC
7764         cnt = *(p++); // Number of pps
7765         for (i = 0; i < cnt; i++) {
7766             nalsize = AV_RB16(p) + 2;
7767             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7768                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7769                 return -1;
7770             }
7771             p += nalsize;
7772         }
7773         // Now store right nal length size, that will be use to parse all other nals
7774         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7775         // Do not reparse avcC
7776         h->got_avcC = 1;
7777     }
7778
7779     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7780         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7781             return -1;
7782         h->got_avcC = 1;
7783     }
7784
7785     buf_index=decode_nal_units(h, buf, buf_size);
7786     if(buf_index < 0)
7787         return -1;
7788
7789     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7790         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7791         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7792         return -1;
7793     }
7794
7795     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7796         Picture *out = s->current_picture_ptr;
7797         Picture *cur = s->current_picture_ptr;
7798         int i, pics, cross_idr, out_of_order, out_idx;
7799
7800         field_end(h);
7801
7802         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7803             /* Wait for second field. */
7804             *data_size = 0;
7805
7806         } else {
7807             cur->interlaced_frame = 0;
7808             cur->repeat_pict = 0;
7809
7810             /* Signal interlacing information externally. */
7811             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7812
7813             if(h->sps.pic_struct_present_flag){
7814                 switch (h->sei_pic_struct)
7815                 {
7816                 case SEI_PIC_STRUCT_FRAME:
7817                     break;
7818                 case SEI_PIC_STRUCT_TOP_FIELD:
7819                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7820                     cur->interlaced_frame = 1;
7821                     break;
7822                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7823                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7824                     if (FIELD_OR_MBAFF_PICTURE)
7825                         cur->interlaced_frame = 1;
7826                     else
7827                         // try to flag soft telecine progressive
7828                         cur->interlaced_frame = h->prev_interlaced_frame;
7829                     break;
7830                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7831                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7832                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7833                     // From these hints, let the applications decide if they apply deinterlacing.
7834                     cur->repeat_pict = 1;
7835                     break;
7836                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7837                     // Force progressive here, as doubling interlaced frame is a bad idea.
7838                     cur->repeat_pict = 2;
7839                     break;
7840                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7841                     cur->repeat_pict = 4;
7842                     break;
7843                 }
7844
7845                 if ((h->sei_ct_type & 3) && h->sei_pic_struct <= SEI_PIC_STRUCT_BOTTOM_TOP)
7846                     cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7847             }else{
7848                 /* Derive interlacing flag from used decoding process. */
7849                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7850             }
7851             h->prev_interlaced_frame = cur->interlaced_frame;
7852
7853             if (cur->field_poc[0] != cur->field_poc[1]){
7854                 /* Derive top_field_first from field pocs. */
7855                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7856             }else{
7857                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7858                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7859                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7860                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7861                         cur->top_field_first = 1;
7862                     else
7863                         cur->top_field_first = 0;
7864                 }else{
7865                     /* Most likely progressive */
7866                     cur->top_field_first = 0;
7867                 }
7868             }
7869
7870         //FIXME do something with unavailable reference frames
7871
7872             /* Sort B-frames into display order */
7873
7874             if(h->sps.bitstream_restriction_flag
7875                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7876                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7877                 s->low_delay = 0;
7878             }
7879
7880             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7881                && !h->sps.bitstream_restriction_flag){
7882                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7883                 s->low_delay= 0;
7884             }
7885
7886             pics = 0;
7887             while(h->delayed_pic[pics]) pics++;
7888
7889             assert(pics <= MAX_DELAYED_PIC_COUNT);
7890
7891             h->delayed_pic[pics++] = cur;
7892             if(cur->reference == 0)
7893                 cur->reference = DELAYED_PIC_REF;
7894
7895             out = h->delayed_pic[0];
7896             out_idx = 0;
7897             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame && !h->delayed_pic[i]->mmco_reset; i++)
7898                 if(h->delayed_pic[i]->poc < out->poc){
7899                     out = h->delayed_pic[i];
7900                     out_idx = i;
7901                 }
7902             cross_idr = !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame || h->delayed_pic[0]->mmco_reset;
7903
7904             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7905
7906             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7907                 { }
7908             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7909                || (s->low_delay &&
7910                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7911                  || cur->pict_type == FF_B_TYPE)))
7912             {
7913                 s->low_delay = 0;
7914                 s->avctx->has_b_frames++;
7915             }
7916
7917             if(out_of_order || pics > s->avctx->has_b_frames){
7918                 out->reference &= ~DELAYED_PIC_REF;
7919                 for(i=out_idx; h->delayed_pic[i]; i++)
7920                     h->delayed_pic[i] = h->delayed_pic[i+1];
7921             }
7922             if(!out_of_order && pics > s->avctx->has_b_frames){
7923                 *data_size = sizeof(AVFrame);
7924
7925                 h->outputed_poc = out->poc;
7926                 *pict= *(AVFrame*)out;
7927             }else{
7928                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7929             }
7930         }
7931     }
7932
7933     assert(pict->data[0] || !*data_size);
7934     ff_print_debug_info(s, pict);
7935 //printf("out %d\n", (int)pict->data[0]);
7936
7937     return get_consumed_bytes(s, buf_index, buf_size);
7938 }
7939 #if 0
7940 static inline void fill_mb_avail(H264Context *h){
7941     MpegEncContext * const s = &h->s;
7942     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7943
7944     if(s->mb_y){
7945         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7946         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7947         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7948     }else{
7949         h->mb_avail[0]=
7950         h->mb_avail[1]=
7951         h->mb_avail[2]= 0;
7952     }
7953     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7954     h->mb_avail[4]= 1; //FIXME move out
7955     h->mb_avail[5]= 0; //FIXME move out
7956 }
7957 #endif
7958
7959 #ifdef TEST
7960 #undef printf
7961 #undef random
7962 #define COUNT 8000
7963 #define SIZE (COUNT*40)
7964 int main(void){
7965     int i;
7966     uint8_t temp[SIZE];
7967     PutBitContext pb;
7968     GetBitContext gb;
7969 //    int int_temp[10000];
7970     DSPContext dsp;
7971     AVCodecContext avctx;
7972
7973     dsputil_init(&dsp, &avctx);
7974
7975     init_put_bits(&pb, temp, SIZE);
7976     printf("testing unsigned exp golomb\n");
7977     for(i=0; i<COUNT; i++){
7978         START_TIMER
7979         set_ue_golomb(&pb, i);
7980         STOP_TIMER("set_ue_golomb");
7981     }
7982     flush_put_bits(&pb);
7983
7984     init_get_bits(&gb, temp, 8*SIZE);
7985     for(i=0; i<COUNT; i++){
7986         int j, s;
7987
7988         s= show_bits(&gb, 24);
7989
7990         START_TIMER
7991         j= get_ue_golomb(&gb);
7992         if(j != i){
7993             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7994 //            return -1;
7995         }
7996         STOP_TIMER("get_ue_golomb");
7997     }
7998
7999
8000     init_put_bits(&pb, temp, SIZE);
8001     printf("testing signed exp golomb\n");
8002     for(i=0; i<COUNT; i++){
8003         START_TIMER
8004         set_se_golomb(&pb, i - COUNT/2);
8005         STOP_TIMER("set_se_golomb");
8006     }
8007     flush_put_bits(&pb);
8008
8009     init_get_bits(&gb, temp, 8*SIZE);
8010     for(i=0; i<COUNT; i++){
8011         int j, s;
8012
8013         s= show_bits(&gb, 24);
8014
8015         START_TIMER
8016         j= get_se_golomb(&gb);
8017         if(j != i - COUNT/2){
8018             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8019 //            return -1;
8020         }
8021         STOP_TIMER("get_se_golomb");
8022     }
8023
8024 #if 0
8025     printf("testing 4x4 (I)DCT\n");
8026
8027     DCTELEM block[16];
8028     uint8_t src[16], ref[16];
8029     uint64_t error= 0, max_error=0;
8030
8031     for(i=0; i<COUNT; i++){
8032         int j;
8033 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8034         for(j=0; j<16; j++){
8035             ref[j]= random()%255;
8036             src[j]= random()%255;
8037         }
8038
8039         h264_diff_dct_c(block, src, ref, 4);
8040
8041         //normalize
8042         for(j=0; j<16; j++){
8043 //            printf("%d ", block[j]);
8044             block[j]= block[j]*4;
8045             if(j&1) block[j]= (block[j]*4 + 2)/5;
8046             if(j&4) block[j]= (block[j]*4 + 2)/5;
8047         }
8048 //        printf("\n");
8049
8050         s->dsp.h264_idct_add(ref, block, 4);
8051 /*        for(j=0; j<16; j++){
8052             printf("%d ", ref[j]);
8053         }
8054         printf("\n");*/
8055
8056         for(j=0; j<16; j++){
8057             int diff= FFABS(src[j] - ref[j]);
8058
8059             error+= diff*diff;
8060             max_error= FFMAX(max_error, diff);
8061         }
8062     }
8063     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8064     printf("testing quantizer\n");
8065     for(qp=0; qp<52; qp++){
8066         for(i=0; i<16; i++)
8067             src1_block[i]= src2_block[i]= random()%255;
8068
8069     }
8070     printf("Testing NAL layer\n");
8071
8072     uint8_t bitstream[COUNT];
8073     uint8_t nal[COUNT*2];
8074     H264Context h;
8075     memset(&h, 0, sizeof(H264Context));
8076
8077     for(i=0; i<COUNT; i++){
8078         int zeros= i;
8079         int nal_length;
8080         int consumed;
8081         int out_length;
8082         uint8_t *out;
8083         int j;
8084
8085         for(j=0; j<COUNT; j++){
8086             bitstream[j]= (random() % 255) + 1;
8087         }
8088
8089         for(j=0; j<zeros; j++){
8090             int pos= random() % COUNT;
8091             while(bitstream[pos] == 0){
8092                 pos++;
8093                 pos %= COUNT;
8094             }
8095             bitstream[pos]=0;
8096         }
8097
8098         START_TIMER
8099
8100         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8101         if(nal_length<0){
8102             printf("encoding failed\n");
8103             return -1;
8104         }
8105
8106         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8107
8108         STOP_TIMER("NAL")
8109
8110         if(out_length != COUNT){
8111             printf("incorrect length %d %d\n", out_length, COUNT);
8112             return -1;
8113         }
8114
8115         if(consumed != nal_length){
8116             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8117             return -1;
8118         }
8119
8120         if(memcmp(bitstream, out, COUNT)){
8121             printf("mismatch\n");
8122             return -1;
8123         }
8124     }
8125 #endif
8126
8127     printf("Testing RBSP\n");
8128
8129
8130     return 0;
8131 }
8132 #endif /* TEST */
8133
8134
8135 av_cold void ff_h264_free_context(H264Context *h)
8136 {
8137     int i;
8138
8139     free_tables(h); //FIXME cleanup init stuff perhaps
8140
8141     for(i = 0; i < MAX_SPS_COUNT; i++)
8142         av_freep(h->sps_buffers + i);
8143
8144     for(i = 0; i < MAX_PPS_COUNT; i++)
8145         av_freep(h->pps_buffers + i);
8146 }
8147
8148 static av_cold int decode_end(AVCodecContext *avctx)
8149 {
8150     H264Context *h = avctx->priv_data;
8151     MpegEncContext *s = &h->s;
8152
8153     ff_h264_free_context(h);
8154
8155     MPV_common_end(s);
8156
8157 //    memset(h, 0, sizeof(H264Context));
8158
8159     return 0;
8160 }
8161
8162
/**
 * Codec descriptor for the "h264" video decoder (CODEC_TYPE_VIDEO,
 * CODEC_ID_H264).
 *
 * The leading initializers are positional: sizeof(H264Context) is presumably
 * the private-data size, and decode_init / decode_end / decode_frame the
 * init, close and decode callbacks, with the NULL between them presumably
 * the unused encode slot -- confirm against the AVCodec definition.
 * Capabilities are CODEC_CAP_DR1 | CODEC_CAP_DELAY (CODEC_CAP_DRAW_HORIZ_BAND
 * is commented out). flush_dpb is installed as the .flush callback and
 * ff_hwaccel_pixfmt_list_420 as the .pix_fmts list.
 */
8163 AVCodec h264_decoder = {
8164     "h264",
8165     CODEC_TYPE_VIDEO,
8166     CODEC_ID_H264,
8167     sizeof(H264Context),
8168     decode_init,
8169     NULL,
8170     decode_end,
8171     decode_frame,
8172     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8173     .flush= flush_dpb,
8174     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8175     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8176 };
8177
8178 #if CONFIG_H264_VDPAU_DECODER
/**
 * Codec descriptor for the "h264_vdpau" decoder, built only when
 * CONFIG_H264_VDPAU_DECODER is set.
 *
 * It reuses the same callbacks as h264_decoder (decode_init, decode_end,
 * decode_frame, flush_dpb) and the same CODEC_ID_H264, and adds
 * CODEC_CAP_HWACCEL_VDPAU to the capability flags.
 * NOTE(review): unlike h264_decoder, no .pix_fmts list is set here --
 * confirm this is intended.
 */
8179 AVCodec h264_vdpau_decoder = {
8180     "h264_vdpau",
8181     CODEC_TYPE_VIDEO,
8182     CODEC_ID_H264,
8183     sizeof(H264Context),
8184     decode_init,
8185     NULL,
8186     decode_end,
8187     decode_frame,
8188     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8189     .flush= flush_dpb,
8190     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8191 };
8192 #endif
8193
8194 #if CONFIG_SVQ3_DECODER
8195 #include "svq3.c"
8196 #endif