libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
/*
 * Statically allocated VLC descriptors and the backing storage for their
 * lookup tables. Each *_size constant below equals the first dimension of
 * the table array declared next to it (for coeff_token the four sizes add up
 * to the single shared array). The code that initialises these tables is not
 * part of this excerpt.
 */
static VLC coeff_token_vlc[4];
static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
static const int coeff_token_vlc_tables_size[4]={520,332,280,256};

static VLC chroma_dc_coeff_token_vlc;
static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
static const int chroma_dc_coeff_token_vlc_table_size = 256;

static VLC total_zeros_vlc[15];
static VLC_TYPE total_zeros_vlc_tables[15][512][2];
static const int total_zeros_vlc_tables_size = 512;

static VLC chroma_dc_total_zeros_vlc[3];
static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
static const int chroma_dc_total_zeros_vlc_tables_size = 8;

static VLC run_vlc[6];
static VLC_TYPE run_vlc_tables[6][8][2];
static const int run_vlc_tables_size = 8;

static VLC run7_vlc;
static VLC_TYPE run7_vlc_table[96][2];
static const int run7_vlc_table_size = 96;
  77
/*
 * Forward declarations of static functions that are referenced before their
 * definitions. The definitions themselves are not visible in this excerpt.
 */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
/*
 * Index tables used by fill_caches() to pick entries out of the left
 * neighbour macroblock(s). fill_caches() selects one row as left_block:
 *   row 0        default (left neighbour has the same field/frame mode),
 *   row 1 or 2   current MB is frame coded, left pair is field coded
 *                (row 2 - bottom, i.e. row 2 for the top MB, row 1 for the bottom MB),
 *   row 3        current MB is field coded, left pair is frame coded.
 * Entries 0..3 are used as indices for intra4x4_pred_mode, non_zero_count and
 * as row offsets into the motion tables; entries 4..7 are used as
 * non_zero_count indices only.
 */
static const uint8_t left_block_options[4][8]={
    {0,1,2,3,7,10,8,11},
    {2,2,3,3,8,11,8,11},
    {0,0,1,1,7,10,7,10},
    {0,2,0,2,7,10,7,10}
};
 106
/* Each of the 7 cavlc_level_tab tables has 1<<LEVEL_TAB_BITS entries of two
 * int8_t values. The code that fills and reads them is outside this excerpt. */
#define LEVEL_TAB_BITS 8
static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
/**
 * Loads the state of the neighbouring macroblocks into the per-macroblock
 * caches of the context.
 *
 * The function locates the top, left, top-left and top-right neighbours of
 * the current macroblock (with extra index adjustments when FRAME_MBAFF is
 * set), stores the top/left indices in h->top_mb_xy and h->left_mb_xy, and
 * then fills, as applicable: the intra sample-availability masks and
 * intra4x4_pred_mode_cache, non_zero_count_cache, top_cbp/left_cbp (CABAC
 * only), mv_cache/ref_cache, mvd_cache and direct_cache (CABAC only) and
 * neighbor_transform_size.
 *
 * @param h           decoder context; the current macroblock is h->mb_xy
 * @param mb_type     type flags of the current macroblock
 * @param for_deblock non-zero when the caches are filled for the loop filter:
 *                    neighbours are then accepted across slice boundaries,
 *                    the intra prediction state is not computed, and the
 *                    top-left/top-right motion data is not loaded
 */
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    int topleft_xy, top_xy, topright_xy, left_xy[2];
    int topleft_type, top_type, topright_type, left_type[2];
    const uint8_t * left_block;
    int topleft_partition= -1;
    int i;

    // macroblock above the current one; the stride is shifted left by FIELD_PICTURE
    top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);

    //FIXME deblocking could skip the intra and nnz parts.
    // Deblocking shortcut: nothing is reloaded when not in MBAFF and either
    // slice_num is 1 or the top neighbour lies in the same slice as this MB.
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
        return;

    /* Wow, what a mess, why didn't they simplify the interlacing & intra
     * stuff, I can't imagine that these complex rules are worth it. */

    // default neighbour positions; the MBAFF branch below may override them
    topleft_xy = top_xy - 1;
    topright_xy= top_xy + 1;
    left_xy[1] = left_xy[0] = mb_xy-1;
    left_block = left_block_options[0];
    if(FRAME_MBAFF){
        // pair_xy is the top macroblock of the current pair (mb_y with bit 0 cleared)
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int topleft_pair_xy  = top_pair_xy - 1;
        const int topright_pair_xy = top_pair_xy + 1;
        const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
        const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
        const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
        const int bottom = (s->mb_y & 1);
        tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);

        if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
            top_xy -= s->mb_stride;
        }
        if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
            topleft_xy -= s->mb_stride;
        } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
            topleft_xy += s->mb_stride;
            // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
            topleft_partition = 0;
        }
        if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
            topright_xy -= s->mb_stride;
        }
        // left pair coded in the other mode: both left entries start at the
        // top MB of the left pair and a different left_block row is selected
        if (left_mb_field_flag != curr_mb_field_flag) {
            left_xy[1] = left_xy[0] = pair_xy - 1;
            if (curr_mb_field_flag) {
                left_xy[1] += s->mb_stride;
                left_block = left_block_options[3];
            } else {
                left_block= left_block_options[2 - bottom];
            }
        }
    }

    // publish the final top/left neighbour indices in the context
    h->top_mb_xy = top_xy;
    h->left_mb_xy[0] = left_xy[0];
    h->left_mb_xy[1] = left_xy[1];
    if(for_deblock){
        // For deblocking a neighbour is used whenever its slice_table entry is
        // below 0xFFFF (presumably 0xFFFF marks "no slice" -- confirm), even if
        // it belongs to another slice; top-left/top-right are not needed.
        topleft_type = 0;
        topright_type = 0;
        top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
        left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;

        if(MB_MBAFF && !IS_INTRA(mb_type)){
            int list;
            for(list=0; list<h->list_count; list++){
                //These values were changed for ease of performing MC, we need to change them back
                //FIXME maybe we can make MC and loop filter use the same values or prevent
                //the MC code from changing ref_cache and rather use a temporary array.
                if(USES_LIST(mb_type,list)){
                    // reload the four 8x8 reference indices of this MB from the
                    // picture and replicate each one over its 2x2 cache cells
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
                    ref += h->b8_stride;
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
                }
            }
        }
    }else{
        // For decoding a neighbour counts only if it belongs to the current
        // slice; otherwise its type is treated as 0 (unavailable).
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;

    if(IS_INTRA(mb_type)){
        // With constrained_intra_pred only neighbours whose type has a bit in
        // IS_INTRA(-1) set pass the tests below; otherwise any non-zero type does.
        int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
        // start from the masks used when all neighbours are present, then
        // clear bits for every neighbour that fails the type_mask test
        h->topleft_samples_available=
        h->top_samples_available=
        h->left_samples_available= 0xFFFF;
        h->topright_samples_available= 0xEEEA;

        if(!(top_type & type_mask)){
            h->topleft_samples_available= 0xB3FF;
            h->top_samples_available= 0x33FF;
            h->topright_samples_available= 0x26EA;
        }
        if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
            if(IS_INTERLACED(mb_type)){
                // the two left neighbours are checked separately, each one
                // clearing its own part of the masks
                if(!(left_type[0] & type_mask)){
                    h->topleft_samples_available&= 0xDFFF;
                    h->left_samples_available&= 0x5FFF;
                }
                if(!(left_type[1] & type_mask)){
                    h->topleft_samples_available&= 0xFF5F;
                    h->left_samples_available&= 0xFF5F;
                }
            }else{
                // both macroblocks of the left pair must pass the test
                int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
                                ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
                assert(left_xy[0] == left_xy[1]);
                if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
                    h->topleft_samples_available&= 0xDF5F;
                    h->left_samples_available&= 0x5F5F;
                }
            }
        }else{
            if(!(left_type[0] & type_mask)){
                h->topleft_samples_available&= 0xDF5F;
                h->left_samples_available&= 0x5F5F;
            }
        }

        if(!(topleft_type & type_mask))
            h->topleft_samples_available&= 0x7FFF;

        if(!(topright_type & type_mask))
            h->topright_samples_available&= 0xFBFF;

        if(IS_INTRA4x4(mb_type)){
            // Neighbouring intra4x4 prediction modes: copied from an intra4x4
            // neighbour, otherwise -1 if the neighbour fails the type_mask
            // test and 2 if it passes.
            if(IS_INTRA4x4(top_type)){
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
            }else{
                int pred;
                if(!(top_type & type_mask))
                    pred= -1;
                else{
                    pred= 2;
                }
                h->intra4x4_pred_mode_cache[4+8*0]=
                h->intra4x4_pred_mode_cache[5+8*0]=
                h->intra4x4_pred_mode_cache[6+8*0]=
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
            }
            for(i=0; i<2; i++){
                if(IS_INTRA4x4(left_type[i])){
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
                }else{
                    int pred;
                    if(!(left_type[i] & type_mask))
                        pred= -1;
                    else{
                        pred= 2;
                    }
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
                }
            }
        }
    }
    }


/*
0 . T T. T T T T
1 L . .L . . . .
2 L . .L . . . .
3 . T TL . . . .
4 L . .L . . . .
5 L . .. . . . .
*/
//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
    // Non-zero coefficient counts of the top neighbour (T cells in the map
    // above). A missing neighbour yields 64, or 0 when CABAC is in use and
    // the current macroblock is not intra.
    if(top_type){
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];

        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];

        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];

    }else{
        h->non_zero_count_cache[4+8*0]=
        h->non_zero_count_cache[5+8*0]=
        h->non_zero_count_cache[6+8*0]=
        h->non_zero_count_cache[7+8*0]=

        h->non_zero_count_cache[1+8*0]=
        h->non_zero_count_cache[2+8*0]=

        h->non_zero_count_cache[1+8*3]=
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;

    }

    // Same for the two left neighbours (L cells); left_block selects which
    // entries of the neighbour are read.
    for (i=0; i<2; i++) {
        if(left_type[i]){
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
        }else{
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
            h->non_zero_count_cache[0+8*1 +   8*i]=
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
        }
    }

    // CABAC only: coded block pattern of the top and left neighbours.
    // A missing neighbour gives 0x1C0 for an intra macroblock and 0 otherwise.
    if( h->pps.cabac ) {
        // top_cbp
        if(top_type) {
            h->top_cbp = h->cbp_table[top_xy];
        } else if(IS_INTRA(mb_type)) {
            h->top_cbp = 0x1C0;
        } else {
            h->top_cbp = 0;
        }
        // left_cbp
        if (left_type[0]) {
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
        } else if(IS_INTRA(mb_type)) {
            h->left_cbp = 0x1C0;
        } else {
            h->left_cbp = 0;
        }
        // bits 1 and 3 of left_cbp are taken from one cbp bit of each left
        // neighbour, selected through left_block
        if (left_type[0]) {
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
        }
        if (left_type[1]) {
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
        }
    }

#if 1
    // Motion data of the neighbours, per reference list.
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
        int list;
        for(list=0; list<h->list_count; list++){
            // a list this macroblock does not use is skipped, unless the MB is
            // direct or the deblocking filter is enabled
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
                /*if(!h->mv_cache_clean[list]){
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                    h->mv_cache_clean[list]= 1;
                }*/
                continue;
            }
            h->mv_cache_clean[list]= 0;

            // top neighbour: its bottom row of motion vectors and bottom pair
            // of reference indices; otherwise zero vectors and LIST_NOT_USED
            // (neighbour exists) or PART_NOT_AVAILABLE (it does not)
            if(USES_LIST(top_type, list)){
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
            }

            // left neighbours: right-most column of each, two cache rows per neighbour
            for(i=0; i<2; i++){
                int cache_idx = scan8[0] - 1 + i*2*8;
                if(USES_LIST(left_type[i], list)){
                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
                }else{
                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
                    h->ref_cache[list][cache_idx  ]=
                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                }
            }

            // the remaining neighbours are not loaded for deblocking, nor for
            // non-spatial direct macroblocks outside MBAFF
            if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
                continue;

            // top-left neighbour; topleft_partition is -1 (all bits set) by
            // default and 0 in the MBAFF special case, so the masked terms
            // below are either added in full or dropped
            if(USES_LIST(topleft_type, list)){
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            // top-right neighbour: its bottom-left vector and reference index
            if(USES_LIST(topright_type, list)){
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
                continue;

            // fixed cache cells that are always marked PART_NOT_AVAILABLE with a zero vector
            h->ref_cache[list][scan8[5 ]+1] =
            h->ref_cache[list][scan8[7 ]+1] =
            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
            h->ref_cache[list][scan8[4 ]] =
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;

            if( h->pps.cabac ) {
                /* XXX beurk, Load mvd */
                // motion vector differences of the top and left neighbours,
                // zero where the neighbour does not use this list
                if(USES_LIST(top_type, list)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                }
                if(USES_LIST(left_type[0], list)){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                }
                if(USES_LIST(left_type[1], list)){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
                }
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;

                if(h->slice_type_nos == FF_B_TYPE){
                    // B slices: clear the 4x4 direct flags of the current MB,
                    // then load the flags of the top and left neighbours
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);

                    if(IS_DIRECT(top_type)){
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
                    }else if(IS_8X8(top_type)){
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
                    }else{
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                    }

                    if(IS_DIRECT(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
                    else if(IS_8X8(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;

                    if(IS_DIRECT(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
                    else if(IS_8X8(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
                }
            }

            // MBAFF: rescale cached neighbour data whose field/frame mode
            // differs from the current macroblock. MAP_MVS applies MAP_F2F to
            // every neighbour cell loaded above; MAP_F2F is defined twice, once
            // per conversion direction.
            if(FRAME_MBAFF){
#define MAP_MVS\
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
                if(MB_FIELD){
                    // frame neighbour seen from a field MB: reference index
                    // doubled, vertical vector components halved
#define MAP_F2F(idx, mb_type)\
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] <<= 1;\
                        h->mv_cache[list][idx][1] /= 2;\
                        h->mvd_cache[list][idx][1] /= 2;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }else{
                    // field neighbour seen from a frame MB: reference index
                    // halved, vertical vector components doubled
#define MAP_F2F(idx, mb_type)\
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] >>= 1;\
                        h->mv_cache[list][idx][1] <<= 1;\
                        h->mvd_cache[list][idx][1] <<= 1;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }
            }
        }
    }
#endif

    // number (0..2) of top/left neighbours that use the 8x8 transform
    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
}
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
/**
 * Reads the one-byte NAL unit header and returns the payload that follows it
 * with the 00 00 03 escape sequences removed.
 *
 * The header byte src[0] is split into h->nal_ref_idc (upper 3 bits) and
 * h->nal_unit_type (lower 5 bits). The payload is then scanned for a
 * 00 00 xx pattern with xx <= 3. If xx != 3 the pattern is treated as the
 * next start code and the payload is cut there.
 *
 * @param h          decoder context; receives the header fields and owns the
 *                   rbsp_buffer used when escapes have to be removed
 * @param src        start of the NAL unit (header byte first)
 * @param dst_length set to the number of payload bytes returned
 * @param consumed   set to the number of input bytes used, header included
 * @param length     number of bytes available at src, header included
 * @return src+1 when no escape had to be removed, otherwise a pointer into
 *         h->rbsp_buffer[]; NULL if that buffer could not be allocated
 *
 * NOTE(review): src[0] is read before length is checked, and the fast scan
 * below reads a whole word at src+i and runs "while(src[i]) i++" without a
 * bound, so this presumably relies on the caller providing readable zero
 * padding behind src+length -- confirm against the callers.
 */
1365 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1366     int i, si, di;
1367     uint8_t *dst;
1368     int bufidx;
1369
1370 //    src[0]&0x80;                //forbidden bit
1371     h->nal_ref_idc= src[0]>>5;
1372     h->nal_unit_type= src[0]&0x1F;
1373
         /* skip the header byte; from here on src/length describe the payload only */
1374     src++; length--;
1375 #if 0
1376     for(i=0; i<length; i++)
1377         printf("%2X ", src[i]);
1378 #endif
1379
     /*
      * Search for the first 00 00 xx (xx <= 3) sequence. Three variants of the
      * loop head are selected by the preprocessor; they share the loop tail
      * below the #endif. RS is subtracted from i at the end of each iteration
      * that found no match, so that together with the loop increment the net
      * advance is 2 bytes in every variant (9-7, 5-3, 2-0).
      */
1380 #if HAVE_FAST_UNALIGNED
1381 # if HAVE_FAST_64BIT
1382 #   define RS 7
1383     for(i=0; i+1<length; i+=9){
             /* word-wide filter: when the masked test is zero the whole step is skipped;
              * presumably a "has zero byte" bit trick restricted to selected byte lanes -- TODO confirm */
1384         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1385 # else
1386 #   define RS 3
1387     for(i=0; i+1<length; i+=5){
             /* same filter as the 64 bit variant, on a 32 bit word */
1388         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1389 # endif
1390             continue;
             /* step back onto a zero at src[i], then advance to the next zero byte */
1391         if(i>0 && !src[i]) i--;
1392         while(src[i]) i++;
1393 #else
1394 #   define RS 0
1395     for(i=0; i+1<length; i+=2){
1396         if(src[i]) continue;
             /* src[i] is zero; start the pattern one byte earlier if that byte is zero too */
1397         if(i>0 && src[i-1]==0) i--;
1398 #endif
1399         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1400             if(src[i+2]!=3){
1401                 /* startcode, so we must be past the end */
1402                 length=i;
1403             }
1404             break;
1405         }
1406         i-= RS;
1407     }
1408
         /* nothing to unescape: either the scan ran to the end or length was just cut to i
          * at a start code; the payload is returned in place without copying */
1409     if(i>=length-1){ //no escaped 0
1410         *dst_length= length;
1411         *consumed= length+1; //+1 for the header
1412         return src;
1413     }
1414
1415     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1416     av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1417     dst= h->rbsp_buffer[bufidx];
1418
1419     if (dst == NULL){
1420         return NULL;
1421     }
1422
1423 //printf("decoding esc\n");
         /* the first i bytes contain no escape and are copied verbatim */
1424     memcpy(dst, src, i);
1425     si=di=i;
1426     while(si+2<length){
1427         //remove escapes (very rare 1:2^22)
1428         if(src[si+2]>3){
                 /* src[si+2] cannot end a 00 00 0x pattern, so two bytes can be copied at once */
1429             dst[di++]= src[si++];
1430             dst[di++]= src[si++];
1431         }else if(src[si]==0 && src[si+1]==0){
1432             if(src[si+2]==3){ //escape
                     /* keep the two zero bytes, drop the 03 */
1433                 dst[di++]= 0;
1434                 dst[di++]= 0;
1435                 si+=3;
1436                 continue;
1437             }else //next start code
1438                 goto nsc;
1439         }
1440
1441         dst[di++]= src[si++];
1442     }
         /* fewer than 3 bytes left: no further escape possible, copy the rest */
1443     while(si<length)
1444         dst[di++]= src[si++];
1445 nsc:
1446
         /* zero the padding behind the unescaped data */
1447     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1448
1449     *dst_length= di;
1450     *consumed= si + 1;//+1 for the header
1451 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1452     return dst;
1453 }
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
1508 #if 0
1509 /**
1510  * DCT transforms the 16 dc values.
1511  * @param qp quantization parameter ??? FIXME
1512  */
1513 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1514 //    const int qmul= dequant_coeff[qp][0];
1515     int i;
1516     int temp[16]; //FIXME check if this is a good idea
1517     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1518     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1519
1520     for(i=0; i<4; i++){
1521         const int offset= y_offset[i];
1522         const int z0= block[offset+stride*0] + block[offset+stride*4];
1523         const int z1= block[offset+stride*0] - block[offset+stride*4];
1524         const int z2= block[offset+stride*1] - block[offset+stride*5];
1525         const int z3= block[offset+stride*1] + block[offset+stride*5];
1526
1527         temp[4*i+0]= z0+z3;
1528         temp[4*i+1]= z1+z2;
1529         temp[4*i+2]= z1-z2;
1530         temp[4*i+3]= z0-z3;
1531     }
1532
1533     for(i=0; i<4; i++){
1534         const int offset= x_offset[i];
1535         const int z0= temp[4*0+i] + temp[4*2+i];
1536         const int z1= temp[4*0+i] - temp[4*2+i];
1537         const int z2= temp[4*1+i] - temp[4*3+i];
1538         const int z3= temp[4*1+i] + temp[4*3+i];
1539
1540         block[stride*0 +offset]= (z0 + z3)>>1;
1541         block[stride*2 +offset]= (z1 + z2)>>1;
1542         block[stride*8 +offset]= (z1 - z2)>>1;
1543         block[stride*10+offset]= (z0 - z3)>>1;
1544     }
1545 }
1546 #endif
1547
1548 #undef xStride
1549 #undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
1572 #if 0
1573 static void chroma_dc_dct_c(DCTELEM *block){
1574     const int stride= 16*2;
1575     const int xStride= 16;
1576     int a,b,c,d,e;
1577
1578     a= block[stride*0 + xStride*0];
1579     b= block[stride*0 + xStride*1];
1580     c= block[stride*1 + xStride*0];
1581     d= block[stride*1 + xStride*1];
1582
1583     e= a-b;
1584     a= a+b;
1585     b= c-d;
1586     c= c+d;
1587
1588     block[stride*0 + xStride*0]= (a+c);
1589     block[stride*0 + xStride*1]= (e+b);
1590     block[stride*1 + xStride*0]= (a-c);
1591     block[stride*1 + xStride*1]= (e-b);
1592 }
1593 #endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
1602 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1603                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1604                            int src_x_offset, int src_y_offset,
1605                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1606     MpegEncContext * const s = &h->s;
1607     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1608     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1609     const int luma_xy= (mx&3) + ((my&3)<<2);
1610     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1611     uint8_t * src_cb, * src_cr;
1612     int extra_width= h->emu_edge_width;
1613     int extra_height= h->emu_edge_height;
1614     int emu=0;
1615     const int full_mx= mx>>2;
1616     const int full_my= my>>2;
1617     const int pic_width  = 16*s->mb_width;
1618     const int pic_height = 16*s->mb_height >> MB_FIELD;
1619
1620     if(mx&7) extra_width -= 3;
1621     if(my&7) extra_height -= 3;
1622
1623     if(   full_mx < 0-extra_width
1624        || full_my < 0-extra_height
1625        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1626        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1627         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1628             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1629         emu=1;
1630     }
1631
1632     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1633     if(!square){
1634         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1635     }
1636
1637     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1638
1639     if(MB_FIELD){
1640         // chroma offset when predicting from a field of opposite parity
1641         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1642         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1643     }
1644     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1645     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1646
1647     if(emu){
1648         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1649             src_cb= s->edge_emu_buffer;
1650     }
1651     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1652
1653     if(emu){
1654         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655             src_cr= s->edge_emu_buffer;
1656     }
1657     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1658 }
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
1795 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1796                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1797                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1798                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1799     MpegEncContext * const s = &h->s;
1800     const int mb_xy= h->mb_xy;
1801     const int mb_type= s->current_picture.mb_type[mb_xy];
1802
1803     assert(IS_INTER(mb_type));
1804
1805     prefetch_motion(h, 0);
1806
1807     if(IS_16X16(mb_type)){
1808         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1809                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1810                 &weight_op[0], &weight_avg[0],
1811                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1812     }else if(IS_16X8(mb_type)){
1813         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1815                 &weight_op[1], &weight_avg[1],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1818                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1819                 &weight_op[1], &weight_avg[1],
1820                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1821     }else if(IS_8X16(mb_type)){
1822         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1823                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1824                 &weight_op[2], &weight_avg[2],
1825                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1826         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1827                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1828                 &weight_op[2], &weight_avg[2],
1829                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1830     }else{
1831         int i;
1832
1833         assert(IS_8X8(mb_type));
1834
1835         for(i=0; i<4; i++){
1836             const int sub_mb_type= h->sub_mb_type[i];
1837             const int n= 4*i;
1838             int x_offset= (i&1)<<2;
1839             int y_offset= (i&2)<<1;
1840
1841             if(IS_SUB_8X8(sub_mb_type)){
1842                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1843                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1844                     &weight_op[3], &weight_avg[3],
1845                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1846             }else if(IS_SUB_8X4(sub_mb_type)){
1847                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1849                     &weight_op[4], &weight_avg[4],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1852                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1853                     &weight_op[4], &weight_avg[4],
1854                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1855             }else if(IS_SUB_4X8(sub_mb_type)){
1856                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1857                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1858                     &weight_op[5], &weight_avg[5],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1861                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                     &weight_op[5], &weight_avg[5],
1863                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864             }else{
1865                 int j;
1866                 assert(IS_SUB_4X4(sub_mb_type));
1867                 for(j=0; j<4; j++){
1868                     int sub_x_offset= x_offset + 2*(j&1);
1869                     int sub_y_offset= y_offset +   (j&2);
1870                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1871                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1872                         &weight_op[6], &weight_avg[6],
1873                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1874                 }
1875             }
1876         }
1877     }
1878
1879     prefetch_motion(h, 1);
1880 }
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000     }
2001 }
2002
2003 static void init_dequant8_coeff_table(H264Context *h){
2004     int i,q,x;
2005     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2008
2009     for(i=0; i<2; i++ ){
2010         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012             break;
2013         }
2014
2015         for(q=0; q<52; q++){
2016             int shift = div6[q];
2017             int idx = rem6[q];
2018             for(x=0; x<64; x++)
2019                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021                     h->pps.scaling_matrix8[i][x]) << shift;
2022         }
2023     }
2024 }
2025
2026 static void init_dequant4_coeff_table(H264Context *h){
2027     int i,j,q,x;
2028     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029     for(i=0; i<6; i++ ){
2030         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031         for(j=0; j<i; j++){
2032             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034                 break;
2035             }
2036         }
2037         if(j<i)
2038             continue;
2039
2040         for(q=0; q<52; q++){
2041             int shift = div6[q] + 2;
2042             int idx = rem6[q];
2043             for(x=0; x<16; x++)
2044                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046                     h->pps.scaling_matrix4[i][x]) << shift;
2047         }
2048     }
2049 }
2050
2051 static void init_dequant_tables(H264Context *h){
2052     int i,x;
2053     init_dequant4_coeff_table(h);
2054     if(h->pps.transform_8x8_mode)
2055         init_dequant8_coeff_table(h);
2056     if(h->sps.transform_bypass){
2057         for(i=0; i<6; i++)
2058             for(x=0; x<16; x++)
2059                 h->dequant4_coeff[i][0][x] = 1<<6;
2060         if(h->pps.transform_8x8_mode)
2061             for(i=0; i<2; i++)
2062                 for(x=0; x<64; x++)
2063                     h->dequant8_coeff[i][0][x] = 1<<6;
2064     }
2065 }
2066
2067
2068 /**
2069  * allocates tables.
2070  * needs width/height
2071  */
2072 static int alloc_tables(H264Context *h){
2073     MpegEncContext * const s = &h->s;
2074     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075     int x,y;
2076
2077     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2078
2079     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2080     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2081     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2082
2083     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2087
2088     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2089     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2090
2091     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2092     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093     for(y=0; y<s->mb_height; y++){
2094         for(x=0; x<s->mb_width; x++){
2095             const int mb_xy= x + y*s->mb_stride;
2096             const int b_xy = 4*x + 4*y*h->b_stride;
2097             const int b8_xy= 2*x + 2*y*h->b8_stride;
2098
2099             h->mb2b_xy [mb_xy]= b_xy;
2100             h->mb2b8_xy[mb_xy]= b8_xy;
2101         }
2102     }
2103
2104     s->obmc_scratchpad = NULL;
2105
2106     if(!h->dequant4_coeff[0])
2107         init_dequant_tables(h);
2108
2109     return 0;
2110 fail:
2111     free_tables(h);
2112     return -1;
2113 }
2114
2115 /**
2116  * Mimic alloc_tables(), but for every context thread.
2117  */
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2120     dst->non_zero_count           = src->non_zero_count;
2121     dst->slice_table              = src->slice_table;
2122     dst->cbp_table                = src->cbp_table;
2123     dst->mb2b_xy                  = src->mb2b_xy;
2124     dst->mb2b8_xy                 = src->mb2b8_xy;
2125     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2126     dst->mvd_table[0]             = src->mvd_table[0];
2127     dst->mvd_table[1]             = src->mvd_table[1];
2128     dst->direct_table             = src->direct_table;
2129
2130     dst->s.obmc_scratchpad = NULL;
2131     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2132 }
2133
2134 /**
2135  * Init context
2136  * Allocate buffers which are not shared amongst multiple threads.
2137  */
2138 static int context_init(H264Context *h){
2139     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141
2142     return 0;
2143 fail:
2144     return -1; // free_tables will clean up for us
2145 }
2146
2147 static av_cold void common_init(H264Context *h){
2148     MpegEncContext * const s = &h->s;
2149
2150     s->width = s->avctx->width;
2151     s->height = s->avctx->height;
2152     s->codec_id= s->avctx->codec->id;
2153
2154     ff_h264_pred_init(&h->hpc, s->codec_id);
2155
2156     h->dequant_coeff_pps= -1;
2157     s->unrestricted_mv=1;
2158     s->decode=1; //FIXME
2159
2160     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2161
2162     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2163     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2164 }
2165
2166 /**
2167  * Reset SEI values at the beginning of the frame.
2168  *
2169  * @param h H.264 context.
2170  */
2171 static void reset_sei(H264Context *h) {
2172     h->sei_recovery_frame_cnt       = -1;
2173     h->sei_dpb_output_delay         =  0;
2174     h->sei_cpb_removal_delay        = -1;
2175     h->sei_buffering_period_present =  0;
2176 }
2177
2178 static av_cold int decode_init(AVCodecContext *avctx){
2179     H264Context *h= avctx->priv_data;
2180     MpegEncContext * const s = &h->s;
2181
2182     MPV_decode_defaults(s);
2183
2184     s->avctx = avctx;
2185     common_init(h);
2186
2187     s->out_format = FMT_H264;
2188     s->workaround_bugs= avctx->workaround_bugs;
2189
2190     // set defaults
2191 //    s->decode_mb= ff_h263_decode_mb;
2192     s->quarter_sample = 1;
2193     if(!avctx->has_b_frames)
2194     s->low_delay= 1;
2195
2196     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2197         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2198     else
2199         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201
2202     decode_init_vlc();
2203
2204     if(avctx->extradata_size > 0 && avctx->extradata &&
2205        *(char *)avctx->extradata == 1){
2206         h->is_avc = 1;
2207         h->got_avcC = 0;
2208     } else {
2209         h->is_avc = 0;
2210     }
2211
2212     h->thread_context[0] = h;
2213     h->outputed_poc = INT_MIN;
2214     h->prev_poc_msb= 1<<16;
2215     reset_sei(h);
2216     if(avctx->codec_id == CODEC_ID_H264){
2217         if(avctx->ticks_per_frame == 1){
2218             s->avctx->time_base.den *=2;
2219         }
2220         avctx->ticks_per_frame = 2;
2221     }
2222     return 0;
2223 }
2224
2225 static int frame_start(H264Context *h){
2226     MpegEncContext * const s = &h->s;
2227     int i;
2228
2229     if(MPV_frame_start(s, s->avctx) < 0)
2230         return -1;
2231     ff_er_frame_start(s);
2232     /*
2233      * MPV_frame_start uses pict_type to derive key_frame.
2234      * This is incorrect for H.264; IDR markings must be used.
2235      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2236      * See decode_nal_units().
2237      */
2238     s->current_picture_ptr->key_frame= 0;
2239
2240     assert(s->linesize && s->uvlinesize);
2241
2242     for(i=0; i<16; i++){
2243         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2244         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2245     }
2246     for(i=0; i<4; i++){
2247         h->block_offset[16+i]=
2248         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2249         h->block_offset[24+16+i]=
2250         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251     }
2252
2253     /* can't be in alloc_tables because linesize isn't known there.
2254      * FIXME: redo bipred weight to not require extra buffer? */
2255     for(i = 0; i < s->avctx->thread_count; i++)
2256         if(!h->thread_context[i]->s.obmc_scratchpad)
2257             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2258
2259     /* some macroblocks will be accessed before they're available */
2260     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2261         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2262
2263 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2264
2265     // We mark the current picture as non-reference after allocating it, so
2266     // that if we break out due to an error it can be released automatically
2267     // in the next MPV_frame_start().
2268     // SVQ3 as well as most other codecs have only last/next/current and thus
2269     // get released even with set reference, besides SVQ3 and others do not
2270     // mark frames as reference later "naturally".
2271     if(s->codec_id != CODEC_ID_SVQ3)
2272         s->current_picture_ptr->reference= 0;
2273
2274     s->current_picture_ptr->field_poc[0]=
2275     s->current_picture_ptr->field_poc[1]= INT_MAX;
2276     assert(s->current_picture_ptr->long_ref==0);
2277
2278     return 0;
2279 }
2280
2281 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2282     MpegEncContext * const s = &h->s;
2283     int i;
2284     int step    = 1;
2285     int offset  = 1;
2286     int uvoffset= 1;
2287     int top_idx = 1;
2288     int skiplast= 0;
2289
2290     src_y  -=   linesize;
2291     src_cb -= uvlinesize;
2292     src_cr -= uvlinesize;
2293
2294     if(!simple && FRAME_MBAFF){
2295         if(s->mb_y&1){
2296             offset  = MB_MBAFF ? 1 : 17;
2297             uvoffset= MB_MBAFF ? 1 : 9;
2298             if(!MB_MBAFF){
2299                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2300                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2301                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2302                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2303                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2304                 }
2305             }
2306         }else{
2307             if(!MB_MBAFF){
2308                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2309                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2310                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2311                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2312                 }
2313                 skiplast= 1;
2314             }
2315             offset  =
2316             uvoffset=
2317             top_idx = MB_MBAFF ? 0 : 1;
2318         }
2319         step= MB_MBAFF ? 2 : 1;
2320     }
2321
2322     // There are two lines saved, the line above the the top macroblock of a pair,
2323     // and the line above the bottom macroblock
2324     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2325     for(i=1; i<17 - skiplast; i++){
2326         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2327     }
2328
2329     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2330     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2331
2332     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2333         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2334         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2335         for(i=1; i<9 - skiplast; i++){
2336             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2337             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2338         }
2339         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2340         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2341     }
2342 }
2343
2344 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2345     MpegEncContext * const s = &h->s;
2346     int temp8, i;
2347     uint64_t temp64;
2348     int deblock_left;
2349     int deblock_top;
2350     int mb_xy;
2351     int step    = 1;
2352     int offset  = 1;
2353     int uvoffset= 1;
2354     int top_idx = 1;
2355
2356     if(!simple && FRAME_MBAFF){
2357         if(s->mb_y&1){
2358             offset  = MB_MBAFF ? 1 : 17;
2359             uvoffset= MB_MBAFF ? 1 : 9;
2360         }else{
2361             offset  =
2362             uvoffset=
2363             top_idx = MB_MBAFF ? 0 : 1;
2364         }
2365         step= MB_MBAFF ? 2 : 1;
2366     }
2367
2368     if(h->deblocking_filter == 2) {
2369         mb_xy = h->mb_xy;
2370         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2371         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2372     } else {
2373         deblock_left = (s->mb_x > 0);
2374         deblock_top =  (s->mb_y > !!MB_FIELD);
2375     }
2376
2377     src_y  -=   linesize + 1;
2378     src_cb -= uvlinesize + 1;
2379     src_cr -= uvlinesize + 1;
2380
2381 #define XCHG(a,b,t,xchg)\
2382 t= a;\
2383 if(xchg)\
2384     a= b;\
2385 b= t;
2386
2387     if(deblock_left){
2388         for(i = !deblock_top; i<16; i++){
2389             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2390         }
2391         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2392     }
2393
2394     if(deblock_top){
2395         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2396         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2397         if(s->mb_x+1 < s->mb_width){
2398             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2399         }
2400     }
2401
2402     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2403         if(deblock_left){
2404             for(i = !deblock_top; i<8; i++){
2405                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2406                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2407             }
2408             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2409             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2410         }
2411         if(deblock_top){
2412             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2413             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2414         }
2415     }
2416 }
2417
2418 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2419     MpegEncContext * const s = &h->s;
2420     const int mb_x= s->mb_x;
2421     const int mb_y= s->mb_y;
2422     const int mb_xy= h->mb_xy;
2423     const int mb_type= s->current_picture.mb_type[mb_xy];
2424     uint8_t  *dest_y, *dest_cb, *dest_cr;
2425     int linesize, uvlinesize /*dct_offset*/;
2426     int i;
2427     int *block_offset = &h->block_offset[0];
2428     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2429     /* is_h264 should always be true if SVQ3 is disabled. */
2430     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2431     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2432     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2433
2434     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2435     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2436     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2437
2438     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2439     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2440
2441     if (!simple && MB_FIELD) {
2442         linesize   = h->mb_linesize   = s->linesize * 2;
2443         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2444         block_offset = &h->block_offset[24];
2445         if(mb_y&1){ //FIXME move out of this function?
2446             dest_y -= s->linesize*15;
2447             dest_cb-= s->uvlinesize*7;
2448             dest_cr-= s->uvlinesize*7;
2449         }
2450         if(FRAME_MBAFF) {
2451             int list;
2452             for(list=0; list<h->list_count; list++){
2453                 if(!USES_LIST(mb_type, list))
2454                     continue;
2455                 if(IS_16X16(mb_type)){
2456                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2457                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2458                 }else{
2459                     for(i=0; i<16; i+=4){
2460                         int ref = h->ref_cache[list][scan8[i]];
2461                         if(ref >= 0)
2462                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2463                     }
2464                 }
2465             }
2466         }
2467     } else {
2468         linesize   = h->mb_linesize   = s->linesize;
2469         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2470 //        dct_offset = s->linesize * 16;
2471     }
2472
2473     if (!simple && IS_INTRA_PCM(mb_type)) {
2474         for (i=0; i<16; i++) {
2475             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2476         }
2477         for (i=0; i<8; i++) {
2478             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2479             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2480         }
2481     } else {
2482         if(IS_INTRA(mb_type)){
2483             if(h->deblocking_filter)
2484                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2485
2486             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2487                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2488                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2489             }
2490
2491             if(IS_INTRA4x4(mb_type)){
2492                 if(simple || !s->encoding){
2493                     if(IS_8x8DCT(mb_type)){
2494                         if(transform_bypass){
2495                             idct_dc_add =
2496                             idct_add    = s->dsp.add_pixels8;
2497                         }else{
2498                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2499                             idct_add    = s->dsp.h264_idct8_add;
2500                         }
2501                         for(i=0; i<16; i+=4){
2502                             uint8_t * const ptr= dest_y + block_offset[i];
2503                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2504                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2505                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2506                             }else{
2507                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2508                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2509                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2510                                 if(nnz){
2511                                     if(nnz == 1 && h->mb[i*16])
2512                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2513                                     else
2514                                         idct_add   (ptr, h->mb + i*16, linesize);
2515                                 }
2516                             }
2517                         }
2518                     }else{
2519                         if(transform_bypass){
2520                             idct_dc_add =
2521                             idct_add    = s->dsp.add_pixels4;
2522                         }else{
2523                             idct_dc_add = s->dsp.h264_idct_dc_add;
2524                             idct_add    = s->dsp.h264_idct_add;
2525                         }
2526                         for(i=0; i<16; i++){
2527                             uint8_t * const ptr= dest_y + block_offset[i];
2528                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2529
2530                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2531                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2532                             }else{
2533                                 uint8_t *topright;
2534                                 int nnz, tr;
2535                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2536                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2537                                     assert(mb_y || linesize <= block_offset[i]);
2538                                     if(!topright_avail){
2539                                         tr= ptr[3 - linesize]*0x01010101;
2540                                         topright= (uint8_t*) &tr;
2541                                     }else
2542                                         topright= ptr + 4 - linesize;
2543                                 }else
2544                                     topright= NULL;
2545
2546                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2547                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2548                                 if(nnz){
2549                                     if(is_h264){
2550                                         if(nnz == 1 && h->mb[i*16])
2551                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2552                                         else
2553                                             idct_add   (ptr, h->mb + i*16, linesize);
2554                                     }else
2555                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2556                                 }
2557                             }
2558                         }
2559                     }
2560                 }
2561             }else{
2562                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2563                 if(is_h264){
2564                     if(!transform_bypass)
2565                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2566                 }else
2567                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2568             }
2569             if(h->deblocking_filter)
2570                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2571         }else if(is_h264){
2572             hl_motion(h, dest_y, dest_cb, dest_cr,
2573                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2574                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2575                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2576         }
2577
2578
2579         if(!IS_INTRA4x4(mb_type)){
2580             if(is_h264){
2581                 if(IS_INTRA16x16(mb_type)){
2582                     if(transform_bypass){
2583                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2584                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2585                         }else{
2586                             for(i=0; i<16; i++){
2587                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2588                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2589                             }
2590                         }
2591                     }else{
2592                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2593                     }
2594                 }else if(h->cbp&15){
2595                     if(transform_bypass){
2596                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2597                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2598                         for(i=0; i<16; i+=di){
2599                             if(h->non_zero_count_cache[ scan8[i] ]){
2600                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2601                             }
2602                         }
2603                     }else{
2604                         if(IS_8x8DCT(mb_type)){
2605                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2606                         }else{
2607                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2608                         }
2609                     }
2610                 }
2611             }else{
2612                 for(i=0; i<16; i++){
2613                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2614                         uint8_t * const ptr= dest_y + block_offset[i];
2615                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2616                     }
2617                 }
2618             }
2619         }
2620
2621         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2622             uint8_t *dest[2] = {dest_cb, dest_cr};
2623             if(transform_bypass){
2624                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2625                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2626                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2627                 }else{
2628                     idct_add = s->dsp.add_pixels4;
2629                     for(i=16; i<16+8; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2631                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2632                     }
2633                 }
2634             }else{
2635                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2636                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2637                 if(is_h264){
2638                     idct_add = s->dsp.h264_idct_add;
2639                     idct_dc_add = s->dsp.h264_idct_dc_add;
2640                     for(i=16; i<16+8; i++){
2641                         if(h->non_zero_count_cache[ scan8[i] ])
2642                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2643                         else if(h->mb[i*16])
2644                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2645                     }
2646                 }else{
2647                     for(i=16; i<16+8; i++){
2648                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2649                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2650                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2651                         }
2652                     }
2653                 }
2654             }
2655         }
2656     }
2657     if(h->cbp || IS_INTRA(mb_type))
2658         s->dsp.clear_blocks(h->mb);
2659
2660     if(h->deblocking_filter) {
2661         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2662         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2663         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2664         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2665         if (!simple && FRAME_MBAFF) {
2666             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2667         } else {
2668             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2669         }
2670     }
2671 }
2672
2673 /**
2674  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2675  */
2676 static void hl_decode_mb_simple(H264Context *h){
2677     hl_decode_mb_internal(h, 1);
2678 }
2679
2680 /**
2681  * Process a macroblock; this handles edge cases, such as interlacing.
2682  */
2683 static void av_noinline hl_decode_mb_complex(H264Context *h){
2684     hl_decode_mb_internal(h, 0);
2685 }
2686
2687 static void hl_decode_mb(H264Context *h){
2688     MpegEncContext * const s = &h->s;
2689     const int mb_xy= h->mb_xy;
2690     const int mb_type= s->current_picture.mb_type[mb_xy];
2691     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2692
2693     if (is_complex)
2694         hl_decode_mb_complex(h);
2695     else hl_decode_mb_simple(h);
2696 }
2697
2698 static void pic_as_field(Picture *pic, const int parity){
2699     int i;
2700     for (i = 0; i < 4; ++i) {
2701         if (parity == PICT_BOTTOM_FIELD)
2702             pic->data[i] += pic->linesize[i];
2703         pic->reference = parity;
2704         pic->linesize[i] *= 2;
2705     }
2706     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2707 }
2708
2709 static int split_field_copy(Picture *dest, Picture *src,
2710                             int parity, int id_add){
2711     int match = !!(src->reference & parity);
2712
2713     if (match) {
2714         *dest = *src;
2715         if(parity != PICT_FRAME){
2716             pic_as_field(dest, parity);
2717             dest->pic_id *= 2;
2718             dest->pic_id += id_add;
2719         }
2720     }
2721
2722     return match;
2723 }
2724
2725 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2726     int i[2]={0};
2727     int index=0;
2728
2729     while(i[0]<len || i[1]<len){
2730         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2731             i[0]++;
2732         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2733             i[1]++;
2734         if(i[0] < len){
2735             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2736             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2737         }
2738         if(i[1] < len){
2739             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2740             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2741         }
2742     }
2743
2744     return index;
2745 }
2746
2747 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2748     int i, best_poc;
2749     int out_i= 0;
2750
2751     for(;;){
2752         best_poc= dir ? INT_MIN : INT_MAX;
2753
2754         for(i=0; i<len; i++){
2755             const int poc= src[i]->poc;
2756             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2757                 best_poc= poc;
2758                 sorted[out_i]= src[i];
2759             }
2760         }
2761         if(best_poc == (dir ? INT_MIN : INT_MAX))
2762             break;
2763         limit= sorted[out_i++]->poc - dir;
2764     }
2765     return out_i;
2766 }
2767
2768 /**
2769  * fills the default_ref_list.
2770  */
2771 static int fill_default_ref_list(H264Context *h){
2772     MpegEncContext * const s = &h->s;
2773     int i, len;
2774
2775     if(h->slice_type_nos==FF_B_TYPE){
2776         Picture *sorted[32];
2777         int cur_poc, list;
2778         int lens[2];
2779
2780         if(FIELD_PICTURE)
2781             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2782         else
2783             cur_poc= s->current_picture_ptr->poc;
2784
2785         for(list= 0; list<2; list++){
2786             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2787             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2788             assert(len<=32);
2789             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2790             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2791             assert(len<=32);
2792
2793             if(len < h->ref_count[list])
2794                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2795             lens[list]= len;
2796         }
2797
2798         if(lens[0] == lens[1] && lens[1] > 1){
2799             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2800             if(i == lens[0])
2801                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2802         }
2803     }else{
2804         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2805         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2806         assert(len <= 32);
2807         if(len < h->ref_count[0])
2808             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2809     }
2810 #ifdef TRACE
2811     for (i=0; i<h->ref_count[0]; i++) {
2812         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2813     }
2814     if(h->slice_type_nos==FF_B_TYPE){
2815         for (i=0; i<h->ref_count[1]; i++) {
2816             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2817         }
2818     }
2819 #endif
2820     return 0;
2821 }
2822
2823 static void print_short_term(H264Context *h);
2824 static void print_long_term(H264Context *h);
2825
2826 /**
2827  * Extract structure information about the picture described by pic_num in
2828  * the current decoding context (frame or field). Note that pic_num is
2829  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2830  * @param pic_num picture number for which to extract structure information
2831  * @param structure one of PICT_XXX describing structure of picture
2832  *                      with pic_num
2833  * @return frame number (short term) or long term index of picture
2834  *         described by pic_num
2835  */
2836 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2837     MpegEncContext * const s = &h->s;
2838
2839     *structure = s->picture_structure;
2840     if(FIELD_PICTURE){
2841         if (!(pic_num & 1))
2842             /* opposite field */
2843             *structure ^= PICT_FRAME;
2844         pic_num >>= 1;
2845     }
2846
2847     return pic_num;
2848 }
2849
2850 static int decode_ref_pic_list_reordering(H264Context *h){
2851     MpegEncContext * const s = &h->s;
2852     int list, index, pic_structure;
2853
2854     print_short_term(h);
2855     print_long_term(h);
2856
2857     for(list=0; list<h->list_count; list++){
2858         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2859
2860         if(get_bits1(&s->gb)){
2861             int pred= h->curr_pic_num;
2862
2863             for(index=0; ; index++){
2864                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2865                 unsigned int pic_id;
2866                 int i;
2867                 Picture *ref = NULL;
2868
2869                 if(reordering_of_pic_nums_idc==3)
2870                     break;
2871
2872                 if(index >= h->ref_count[list]){
2873                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2874                     return -1;
2875                 }
2876
2877                 if(reordering_of_pic_nums_idc<3){
2878                     if(reordering_of_pic_nums_idc<2){
2879                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2880                         int frame_num;
2881
2882                         if(abs_diff_pic_num > h->max_pic_num){
2883                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2884                             return -1;
2885                         }
2886
2887                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2888                         else                                pred+= abs_diff_pic_num;
2889                         pred &= h->max_pic_num - 1;
2890
2891                         frame_num = pic_num_extract(h, pred, &pic_structure);
2892
2893                         for(i= h->short_ref_count-1; i>=0; i--){
2894                             ref = h->short_ref[i];
2895                             assert(ref->reference);
2896                             assert(!ref->long_ref);
2897                             if(
2898                                    ref->frame_num == frame_num &&
2899                                    (ref->reference & pic_structure)
2900                               )
2901                                 break;
2902                         }
2903                         if(i>=0)
2904                             ref->pic_id= pred;
2905                     }else{
2906                         int long_idx;
2907                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2908
2909                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2910
2911                         if(long_idx>31){
2912                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2913                             return -1;
2914                         }
2915                         ref = h->long_ref[long_idx];
2916                         assert(!(ref && !ref->reference));
2917                         if(ref && (ref->reference & pic_structure)){
2918                             ref->pic_id= pic_id;
2919                             assert(ref->long_ref);
2920                             i=0;
2921                         }else{
2922                             i=-1;
2923                         }
2924                     }
2925
2926                     if (i < 0) {
2927                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2928                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2929                     } else {
2930                         for(i=index; i+1<h->ref_count[list]; i++){
2931                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2932                                 break;
2933                         }
2934                         for(; i > index; i--){
2935                             h->ref_list[list][i]= h->ref_list[list][i-1];
2936                         }
2937                         h->ref_list[list][index]= *ref;
2938                         if (FIELD_PICTURE){
2939                             pic_as_field(&h->ref_list[list][index], pic_structure);
2940                         }
2941                     }
2942                 }else{
2943                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2944                     return -1;
2945                 }
2946             }
2947         }
2948     }
2949     for(list=0; list<h->list_count; list++){
2950         for(index= 0; index < h->ref_count[list]; index++){
2951             if(!h->ref_list[list][index].data[0]){
2952                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2953                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2954             }
2955         }
2956     }
2957
2958     return 0;
2959 }
2960
2961 static void fill_mbaff_ref_list(H264Context *h){
2962     int list, i, j;
2963     for(list=0; list<2; list++){ //FIXME try list_count
2964         for(i=0; i<h->ref_count[list]; i++){
2965             Picture *frame = &h->ref_list[list][i];
2966             Picture *field = &h->ref_list[list][16+2*i];
2967             field[0] = *frame;
2968             for(j=0; j<3; j++)
2969                 field[0].linesize[j] <<= 1;
2970             field[0].reference = PICT_TOP_FIELD;
2971             field[0].poc= field[0].field_poc[0];
2972             field[1] = field[0];
2973             for(j=0; j<3; j++)
2974                 field[1].data[j] += frame->linesize[j];
2975             field[1].reference = PICT_BOTTOM_FIELD;
2976             field[1].poc= field[1].field_poc[1];
2977
2978             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2979             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2980             for(j=0; j<2; j++){
2981                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2982                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2983             }
2984         }
2985     }
2986     for(j=0; j<h->ref_count[1]; j++){
2987         for(i=0; i<h->ref_count[0]; i++)
2988             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2989         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2990         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2991     }
2992 }
2993
2994 static int pred_weight_table(H264Context *h){
2995     MpegEncContext * const s = &h->s;
2996     int list, i;
2997     int luma_def, chroma_def;
2998
2999     h->use_weight= 0;
3000     h->use_weight_chroma= 0;
3001     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3002     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3003     luma_def = 1<<h->luma_log2_weight_denom;
3004     chroma_def = 1<<h->chroma_log2_weight_denom;
3005
3006     for(list=0; list<2; list++){
3007         h->luma_weight_flag[list]   = 0;
3008         h->chroma_weight_flag[list] = 0;
3009         for(i=0; i<h->ref_count[list]; i++){
3010             int luma_weight_flag, chroma_weight_flag;
3011
3012             luma_weight_flag= get_bits1(&s->gb);
3013             if(luma_weight_flag){
3014                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3015                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3016                 if(   h->luma_weight[list][i] != luma_def
3017                    || h->luma_offset[list][i] != 0) {
3018                     h->use_weight= 1;
3019                     h->luma_weight_flag[list]= 1;
3020                 }
3021             }else{
3022                 h->luma_weight[list][i]= luma_def;
3023                 h->luma_offset[list][i]= 0;
3024             }
3025
3026             if(CHROMA){
3027                 chroma_weight_flag= get_bits1(&s->gb);
3028                 if(chroma_weight_flag){
3029                     int j;
3030                     for(j=0; j<2; j++){
3031                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3032                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3033                         if(   h->chroma_weight[list][i][j] != chroma_def
3034                            || h->chroma_offset[list][i][j] != 0) {
3035                             h->use_weight_chroma= 1;
3036                             h->chroma_weight_flag[list]= 1;
3037                         }
3038                     }
3039                 }else{
3040                     int j;
3041                     for(j=0; j<2; j++){
3042                         h->chroma_weight[list][i][j]= chroma_def;
3043                         h->chroma_offset[list][i][j]= 0;
3044                     }
3045                 }
3046             }
3047         }
3048         if(h->slice_type_nos != FF_B_TYPE) break;
3049     }
3050     h->use_weight= h->use_weight || h->use_weight_chroma;
3051     return 0;
3052 }
3053
3054 static void implicit_weight_table(H264Context *h){
3055     MpegEncContext * const s = &h->s;
3056     int ref0, ref1, i;
3057     int cur_poc = s->current_picture_ptr->poc;
3058
3059     for (i = 0; i < 2; i++) {
3060         h->luma_weight_flag[i]   = 0;
3061         h->chroma_weight_flag[i] = 0;
3062     }
3063
3064     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3065        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3066         h->use_weight= 0;
3067         h->use_weight_chroma= 0;
3068         return;
3069     }
3070
3071     h->use_weight= 2;
3072     h->use_weight_chroma= 2;
3073     h->luma_log2_weight_denom= 5;
3074     h->chroma_log2_weight_denom= 5;
3075
3076     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3077         int poc0 = h->ref_list[0][ref0].poc;
3078         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3079             int poc1 = h->ref_list[1][ref1].poc;
3080             int td = av_clip(poc1 - poc0, -128, 127);
3081             if(td){
3082                 int tb = av_clip(cur_poc - poc0, -128, 127);
3083                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3084                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3085                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3086                     h->implicit_weight[ref0][ref1] = 32;
3087                 else
3088                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3089             }else
3090                 h->implicit_weight[ref0][ref1] = 32;
3091         }
3092     }
3093 }
3094
3095 /**
3096  * Mark a picture as no longer needed for reference. The refmask
3097  * argument allows unreferencing of individual fields or the whole frame.
3098  * If the picture becomes entirely unreferenced, but is being held for
3099  * display purposes, it is marked as such.
3100  * @param refmask mask of fields to unreference; the mask is bitwise
3101  *                anded with the reference marking of pic
3102  * @return non-zero if pic becomes entirely unreferenced (except possibly
3103  *         for display purposes) zero if one of the fields remains in
3104  *         reference
3105  */
3106 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3107     int i;
3108     if (pic->reference &= refmask) {
3109         return 0;
3110     } else {
3111         for(i = 0; h->delayed_pic[i]; i++)
3112             if(pic == h->delayed_pic[i]){
3113                 pic->reference=DELAYED_PIC_REF;
3114                 break;
3115             }
3116         return 1;
3117     }
3118 }
3119
3120 /**
3121  * instantaneous decoder refresh.
3122  */
3123 static void idr(H264Context *h){
3124     int i;
3125
3126     for(i=0; i<16; i++){
3127         remove_long(h, i, 0);
3128     }
3129     assert(h->long_ref_count==0);
3130
3131     for(i=0; i<h->short_ref_count; i++){
3132         unreference_pic(h, h->short_ref[i], 0);
3133         h->short_ref[i]= NULL;
3134     }
3135     h->short_ref_count=0;
3136     h->prev_frame_num= 0;
3137     h->prev_frame_num_offset= 0;
3138     h->prev_poc_msb=
3139     h->prev_poc_lsb= 0;
3140 }
3141
3142 /* forget old pics after a seek */
3143 static void flush_dpb(AVCodecContext *avctx){
3144     H264Context *h= avctx->priv_data;
3145     int i;
3146     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3147         if(h->delayed_pic[i])
3148             h->delayed_pic[i]->reference= 0;
3149         h->delayed_pic[i]= NULL;
3150     }
3151     h->outputed_poc= INT_MIN;
3152     idr(h);
3153     if(h->s.current_picture_ptr)
3154         h->s.current_picture_ptr->reference= 0;
3155     h->s.first_field= 0;
3156     reset_sei(h);
3157     ff_mpeg_flush(avctx);
3158 }
3159
3160 /**
3161  * Find a Picture in the short term reference list by frame number.
3162  * @param frame_num frame number to search for
3163  * @param idx the index into h->short_ref where returned picture is found
3164  *            undefined if no picture found.
3165  * @return pointer to the found picture, or NULL if no pic with the provided
3166  *                 frame number is found
3167  */
3168 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3169     MpegEncContext * const s = &h->s;
3170     int i;
3171
3172     for(i=0; i<h->short_ref_count; i++){
3173         Picture *pic= h->short_ref[i];
3174         if(s->avctx->debug&FF_DEBUG_MMCO)
3175             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3176         if(pic->frame_num == frame_num) {
3177             *idx = i;
3178             return pic;
3179         }
3180     }
3181     return NULL;
3182 }
3183
3184 /**
3185  * Remove a picture from the short term reference list by its index in
3186  * that list.  This does no checking on the provided index; it is assumed
3187  * to be valid. Other list entries are shifted down.
3188  * @param i index into h->short_ref of picture to remove.
3189  */
3190 static void remove_short_at_index(H264Context *h, int i){
3191     assert(i >= 0 && i < h->short_ref_count);
3192     h->short_ref[i]= NULL;
3193     if (--h->short_ref_count)
3194         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3195 }
3196
3197 /**
3198  *
3199  * @return the removed picture or NULL if an error occurs
3200  */
3201 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3202     MpegEncContext * const s = &h->s;
3203     Picture *pic;
3204     int i;
3205
3206     if(s->avctx->debug&FF_DEBUG_MMCO)
3207         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3208
3209     pic = find_short(h, frame_num, &i);
3210     if (pic){
3211         if(unreference_pic(h, pic, ref_mask))
3212         remove_short_at_index(h, i);
3213     }
3214
3215     return pic;
3216 }
3217
3218 /**
3219  * Remove a picture from the long term reference list by its index in
3220  * that list.
3221  * @return the removed picture or NULL if an error occurs
3222  */
3223 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3224     Picture *pic;
3225
3226     pic= h->long_ref[i];
3227     if (pic){
3228         if(unreference_pic(h, pic, ref_mask)){
3229             assert(h->long_ref[i]->long_ref == 1);
3230             h->long_ref[i]->long_ref= 0;
3231             h->long_ref[i]= NULL;
3232             h->long_ref_count--;
3233         }
3234     }
3235
3236     return pic;
3237 }
3238
3239 /**
3240  * print short term list
3241  */
3242 static void print_short_term(H264Context *h) {
3243     uint32_t i;
3244     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3245         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3246         for(i=0; i<h->short_ref_count; i++){
3247             Picture *pic= h->short_ref[i];
3248             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3249         }
3250     }
3251 }
3252
3253 /**
3254  * print long term list
3255  */
3256 static void print_long_term(H264Context *h) {
3257     uint32_t i;
3258     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3259         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3260         for(i = 0; i < 16; i++){
3261             Picture *pic= h->long_ref[i];
3262             if (pic) {
3263                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3264             }
3265         }
3266     }
3267 }
3268
3269 /**
3270  * Executes the reference picture marking (memory management control operations).
3271  */
3272 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3273     MpegEncContext * const s = &h->s;
3274     int i, av_uninit(j);
3275     int current_ref_assigned=0;
3276     Picture *av_uninit(pic);
3277
3278     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3279         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3280
3281     for(i=0; i<mmco_count; i++){
3282         int av_uninit(structure), av_uninit(frame_num);
3283         if(s->avctx->debug&FF_DEBUG_MMCO)
3284             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3285
3286         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3287            || mmco[i].opcode == MMCO_SHORT2LONG){
3288             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3289             pic = find_short(h, frame_num, &j);
3290             if(!pic){
3291                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3292                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3293                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3294                 continue;
3295             }
3296         }
3297
3298         switch(mmco[i].opcode){
3299         case MMCO_SHORT2UNUSED:
3300             if(s->avctx->debug&FF_DEBUG_MMCO)
3301                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3302             remove_short(h, frame_num, structure ^ PICT_FRAME);
3303             break;
3304         case MMCO_SHORT2LONG:
3305                 if (h->long_ref[mmco[i].long_arg] != pic)
3306                     remove_long(h, mmco[i].long_arg, 0);
3307
3308                 remove_short_at_index(h, j);
3309                 h->long_ref[ mmco[i].long_arg ]= pic;
3310                 if (h->long_ref[ mmco[i].long_arg ]){
3311                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312                     h->long_ref_count++;
3313                 }
3314             break;
3315         case MMCO_LONG2UNUSED:
3316             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3317             pic = h->long_ref[j];
3318             if (pic) {
3319                 remove_long(h, j, structure ^ PICT_FRAME);
3320             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3321                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3322             break;
3323         case MMCO_LONG:
3324                     // Comment below left from previous code as it is an interresting note.
3325                     /* First field in pair is in short term list or
3326                      * at a different long term index.
3327                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3328                      * Report the problem and keep the pair where it is,
3329                      * and mark this field valid.
3330                      */
3331
3332             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3333                 remove_long(h, mmco[i].long_arg, 0);
3334
3335                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3336                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3337                 h->long_ref_count++;
3338             }
3339
3340             s->current_picture_ptr->reference |= s->picture_structure;
3341             current_ref_assigned=1;
3342             break;
3343         case MMCO_SET_MAX_LONG:
3344             assert(mmco[i].long_arg <= 16);
3345             // just remove the long term which index is greater than new max
3346             for(j = mmco[i].long_arg; j<16; j++){
3347                 remove_long(h, j, 0);
3348             }
3349             break;
3350         case MMCO_RESET:
3351             while(h->short_ref_count){
3352                 remove_short(h, h->short_ref[0]->frame_num, 0);
3353             }
3354             for(j = 0; j < 16; j++) {
3355                 remove_long(h, j, 0);
3356             }
3357             s->current_picture_ptr->poc=
3358             s->current_picture_ptr->field_poc[0]=
3359             s->current_picture_ptr->field_poc[1]=
3360             h->poc_lsb=
3361             h->poc_msb=
3362             h->frame_num=
3363             s->current_picture_ptr->frame_num= 0;
3364             break;
3365         default: assert(0);
3366         }
3367     }
3368
3369     if (!current_ref_assigned) {
3370         /* Second field of complementary field pair; the first field of
3371          * which is already referenced. If short referenced, it
3372          * should be first entry in short_ref. If not, it must exist
3373          * in long_ref; trying to put it on the short list here is an
3374          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3375          */
3376         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3377             /* Just mark the second field valid */
3378             s->current_picture_ptr->reference = PICT_FRAME;
3379         } else if (s->current_picture_ptr->long_ref) {
3380             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3381                                              "assignment for second field "
3382                                              "in complementary field pair "
3383                                              "(first field is long term)\n");
3384         } else {
3385             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3386             if(pic){
3387                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3388             }
3389
3390             if(h->short_ref_count)
3391                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3392
3393             h->short_ref[0]= s->current_picture_ptr;
3394             h->short_ref_count++;
3395             s->current_picture_ptr->reference |= s->picture_structure;
3396         }
3397     }
3398
3399     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3400
3401         /* We have too many reference frames, probably due to corrupted
3402          * stream. Need to discard one frame. Prevents overrun of the
3403          * short_ref and long_ref buffers.
3404          */
3405         av_log(h->s.avctx, AV_LOG_ERROR,
3406                "number of reference frames exceeds max (probably "
3407                "corrupt input), discarding one\n");
3408
3409         if (h->long_ref_count && !h->short_ref_count) {
3410             for (i = 0; i < 16; ++i)
3411                 if (h->long_ref[i])
3412                     break;
3413
3414             assert(i < 16);
3415             remove_long(h, i, 0);
3416         } else {
3417             pic = h->short_ref[h->short_ref_count - 1];
3418             remove_short(h, pic->frame_num, 0);
3419         }
3420     }
3421
3422     print_short_term(h);
3423     print_long_term(h);
3424     return 0;
3425 }
3426
3427 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3428     MpegEncContext * const s = &h->s;
3429     int i;
3430
3431     h->mmco_index= 0;
3432     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3433         s->broken_link= get_bits1(gb) -1;
3434         if(get_bits1(gb)){
3435             h->mmco[0].opcode= MMCO_LONG;
3436             h->mmco[0].long_arg= 0;
3437             h->mmco_index= 1;
3438         }
3439     }else{
3440         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3441             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3442                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3443
3444                 h->mmco[i].opcode= opcode;
3445                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3446                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3447 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3448                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3449                         return -1;
3450                     }*/
3451                 }
3452                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3453                     unsigned int long_arg= get_ue_golomb_31(gb);
3454                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3455                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3456                         return -1;
3457                     }
3458                     h->mmco[i].long_arg= long_arg;
3459                 }
3460
3461                 if(opcode > (unsigned)MMCO_LONG){
3462                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3463                     return -1;
3464                 }
3465                 if(opcode == MMCO_END)
3466                     break;
3467             }
3468             h->mmco_index= i;
3469         }else{
3470             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3471
3472             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3473                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3474                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3475                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3476                 h->mmco_index= 1;
3477                 if (FIELD_PICTURE) {
3478                     h->mmco[0].short_pic_num *= 2;
3479                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3480                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3481                     h->mmco_index= 2;
3482                 }
3483             }
3484         }
3485     }
3486
3487     return 0;
3488 }
3489
3490 static int init_poc(H264Context *h){
3491     MpegEncContext * const s = &h->s;
3492     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3493     int field_poc[2];
3494     Picture *cur = s->current_picture_ptr;
3495
3496     h->frame_num_offset= h->prev_frame_num_offset;
3497     if(h->frame_num < h->prev_frame_num)
3498         h->frame_num_offset += max_frame_num;
3499
3500     if(h->sps.poc_type==0){
3501         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3502
3503         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3504             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3505         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3506             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3507         else
3508             h->poc_msb = h->prev_poc_msb;
3509 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3510         field_poc[0] =
3511         field_poc[1] = h->poc_msb + h->poc_lsb;
3512         if(s->picture_structure == PICT_FRAME)
3513             field_poc[1] += h->delta_poc_bottom;
3514     }else if(h->sps.poc_type==1){
3515         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3516         int i;
3517
3518         if(h->sps.poc_cycle_length != 0)
3519             abs_frame_num = h->frame_num_offset + h->frame_num;
3520         else
3521             abs_frame_num = 0;
3522
3523         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3524             abs_frame_num--;
3525
3526         expected_delta_per_poc_cycle = 0;
3527         for(i=0; i < h->sps.poc_cycle_length; i++)
3528             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3529
3530         if(abs_frame_num > 0){
3531             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3532             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3533
3534             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3535             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3536                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3537         } else
3538             expectedpoc = 0;
3539
3540         if(h->nal_ref_idc == 0)
3541             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3542
3543         field_poc[0] = expectedpoc + h->delta_poc[0];
3544         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3545
3546         if(s->picture_structure == PICT_FRAME)
3547             field_poc[1] += h->delta_poc[1];
3548     }else{
3549         int poc= 2*(h->frame_num_offset + h->frame_num);
3550
3551         if(!h->nal_ref_idc)
3552             poc--;
3553
3554         field_poc[0]= poc;
3555         field_poc[1]= poc;
3556     }
3557
3558     if(s->picture_structure != PICT_BOTTOM_FIELD)
3559         s->current_picture_ptr->field_poc[0]= field_poc[0];
3560     if(s->picture_structure != PICT_TOP_FIELD)
3561         s->current_picture_ptr->field_poc[1]= field_poc[1];
3562     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3563
3564     return 0;
3565 }
3566
3567
3568 /**
3569  * initialize scan tables
3570  */
3571 static void init_scan_tables(H264Context *h){
3572     MpegEncContext * const s = &h->s;
3573     int i;
3574     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3575         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3576         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3577     }else{
3578         for(i=0; i<16; i++){
3579 #define T(x) (x>>2) | ((x<<2) & 0xF)
3580             h->zigzag_scan[i] = T(zigzag_scan[i]);
3581             h-> field_scan[i] = T( field_scan[i]);
3582 #undef T
3583         }
3584     }
3585     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3586         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3587         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3588         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3589         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3590     }else{
3591         for(i=0; i<64; i++){
3592 #define T(x) (x>>3) | ((x&7)<<3)
3593             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3594             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3595             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3596             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3597 #undef T
3598         }
3599     }
3600     if(h->sps.transform_bypass){ //FIXME same ugly
3601         h->zigzag_scan_q0          = zigzag_scan;
3602         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3603         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3604         h->field_scan_q0           = field_scan;
3605         h->field_scan8x8_q0        = field_scan8x8;
3606         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3607     }else{
3608         h->zigzag_scan_q0          = h->zigzag_scan;
3609         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3610         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3611         h->field_scan_q0           = h->field_scan;
3612         h->field_scan8x8_q0        = h->field_scan8x8;
3613         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3614     }
3615 }
3616
3617 /**
3618  * Replicates H264 "master" context to thread contexts.
3619  */
3620 static void clone_slice(H264Context *dst, H264Context *src)
3621 {
3622     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3623     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3624     dst->s.current_picture      = src->s.current_picture;
3625     dst->s.linesize             = src->s.linesize;
3626     dst->s.uvlinesize           = src->s.uvlinesize;
3627     dst->s.first_field          = src->s.first_field;
3628
3629     dst->prev_poc_msb           = src->prev_poc_msb;
3630     dst->prev_poc_lsb           = src->prev_poc_lsb;
3631     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3632     dst->prev_frame_num         = src->prev_frame_num;
3633     dst->short_ref_count        = src->short_ref_count;
3634
3635     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3636     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3637     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3638     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3639
3640     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3641     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3642 }
3643
3644 /**
3645  * decodes a slice header.
3646  * This will also call MPV_common_init() and frame_start() as needed.
3647  *
3648  * @param h h264context
3649  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3650  *
3651  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3652  */
3653 static int decode_slice_header(H264Context *h, H264Context *h0){
3654     MpegEncContext * const s = &h->s;
3655     MpegEncContext * const s0 = &h0->s;
3656     unsigned int first_mb_in_slice;
3657     unsigned int pps_id;
3658     int num_ref_idx_active_override_flag;
3659     unsigned int slice_type, tmp, i, j;
3660     int default_ref_list_done = 0;
3661     int last_pic_structure;
3662
3663     s->dropable= h->nal_ref_idc == 0;
3664
3665     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3666         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3667         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3668     }else{
3669         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3670         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3671     }
3672
3673     first_mb_in_slice= get_ue_golomb(&s->gb);
3674
3675     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3676         h0->current_slice = 0;
3677         if (!s0->first_field)
3678             s->current_picture_ptr= NULL;
3679     }
3680
3681     slice_type= get_ue_golomb_31(&s->gb);
3682     if(slice_type > 9){
3683         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3684         return -1;
3685     }
3686     if(slice_type > 4){
3687         slice_type -= 5;
3688         h->slice_type_fixed=1;
3689     }else
3690         h->slice_type_fixed=0;
3691
3692     slice_type= golomb_to_pict_type[ slice_type ];
3693     if (slice_type == FF_I_TYPE
3694         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3695         default_ref_list_done = 1;
3696     }
3697     h->slice_type= slice_type;
3698     h->slice_type_nos= slice_type & 3;
3699
3700     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3701     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3702         av_log(h->s.avctx, AV_LOG_ERROR,
3703                "B picture before any references, skipping\n");
3704         return -1;
3705     }
3706
3707     pps_id= get_ue_golomb(&s->gb);
3708     if(pps_id>=MAX_PPS_COUNT){
3709         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3710         return -1;
3711     }
3712     if(!h0->pps_buffers[pps_id]) {
3713         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3714         return -1;
3715     }
3716     h->pps= *h0->pps_buffers[pps_id];
3717
3718     if(!h0->sps_buffers[h->pps.sps_id]) {
3719         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3720         return -1;
3721     }
3722     h->sps = *h0->sps_buffers[h->pps.sps_id];
3723
3724     if(h == h0 && h->dequant_coeff_pps != pps_id){
3725         h->dequant_coeff_pps = pps_id;
3726         init_dequant_tables(h);
3727     }
3728
3729     s->mb_width= h->sps.mb_width;
3730     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3731
3732     h->b_stride=  s->mb_width*4;
3733     h->b8_stride= s->mb_width*2;
3734
3735     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3736     if(h->sps.frame_mbs_only_flag)
3737         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3738     else
3739         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3740
3741     if (s->context_initialized
3742         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3743         if(h != h0)
3744             return -1;   // width / height changed during parallelized decoding
3745         free_tables(h);
3746         flush_dpb(s->avctx);
3747         MPV_common_end(s);
3748     }
3749     if (!s->context_initialized) {
3750         if(h != h0)
3751             return -1;  // we cant (re-)initialize context during parallel decoding
3752         if (MPV_common_init(s) < 0)
3753             return -1;
3754         s->first_field = 0;
3755
3756         init_scan_tables(h);
3757         alloc_tables(h);
3758
3759         for(i = 1; i < s->avctx->thread_count; i++) {
3760             H264Context *c;
3761             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3762             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3763             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3764             c->sps = h->sps;
3765             c->pps = h->pps;
3766             init_scan_tables(c);
3767             clone_tables(c, h);
3768         }
3769
3770         for(i = 0; i < s->avctx->thread_count; i++)
3771             if(context_init(h->thread_context[i]) < 0)
3772                 return -1;
3773
3774         s->avctx->width = s->width;
3775         s->avctx->height = s->height;
3776         s->avctx->sample_aspect_ratio= h->sps.sar;
3777         if(!s->avctx->sample_aspect_ratio.den)
3778             s->avctx->sample_aspect_ratio.den = 1;
3779
3780         if(h->sps.timing_info_present_flag){
3781             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3782             if(h->x264_build > 0 && h->x264_build < 44)
3783                 s->avctx->time_base.den *= 2;
3784             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3785                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3786         }
3787     }
3788
3789     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3790
3791     h->mb_mbaff = 0;
3792     h->mb_aff_frame = 0;
3793     last_pic_structure = s0->picture_structure;
3794     if(h->sps.frame_mbs_only_flag){
3795         s->picture_structure= PICT_FRAME;
3796     }else{
3797         if(get_bits1(&s->gb)) { //field_pic_flag
3798             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3799         } else {
3800             s->picture_structure= PICT_FRAME;
3801             h->mb_aff_frame = h->sps.mb_aff;
3802         }
3803     }
3804     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3805
3806     if(h0->current_slice == 0){
3807         while(h->frame_num !=  h->prev_frame_num &&
3808               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3809             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3810             if (frame_start(h) < 0)
3811                 return -1;
3812             h->prev_frame_num++;
3813             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3814             s->current_picture_ptr->frame_num= h->prev_frame_num;
3815             execute_ref_pic_marking(h, NULL, 0);
3816         }
3817
3818         /* See if we have a decoded first field looking for a pair... */
3819         if (s0->first_field) {
3820             assert(s0->current_picture_ptr);
3821             assert(s0->current_picture_ptr->data[0]);
3822             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3823
3824             /* figure out if we have a complementary field pair */
3825             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3826                 /*
3827                  * Previous field is unmatched. Don't display it, but let it
3828                  * remain for reference if marked as such.
3829                  */
3830                 s0->current_picture_ptr = NULL;
3831                 s0->first_field = FIELD_PICTURE;
3832
3833             } else {
3834                 if (h->nal_ref_idc &&
3835                         s0->current_picture_ptr->reference &&
3836                         s0->current_picture_ptr->frame_num != h->frame_num) {
3837                     /*
3838                      * This and previous field were reference, but had
3839                      * different frame_nums. Consider this field first in
3840                      * pair. Throw away previous field except for reference
3841                      * purposes.
3842                      */
3843                     s0->first_field = 1;
3844                     s0->current_picture_ptr = NULL;
3845
3846                 } else {
3847                     /* Second field in complementary pair */
3848                     s0->first_field = 0;
3849                 }
3850             }
3851
3852         } else {
3853             /* Frame or first field in a potentially complementary pair */
3854             assert(!s0->current_picture_ptr);
3855             s0->first_field = FIELD_PICTURE;
3856         }
3857
3858         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3859             s0->first_field = 0;
3860             return -1;
3861         }
3862     }
3863     if(h != h0)
3864         clone_slice(h, h0);
3865
3866     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3867
3868     assert(s->mb_num == s->mb_width * s->mb_height);
3869     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3870        first_mb_in_slice                    >= s->mb_num){
3871         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3872         return -1;
3873     }
3874     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3875     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3876     if (s->picture_structure == PICT_BOTTOM_FIELD)
3877         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3878     assert(s->mb_y < s->mb_height);
3879
3880     if(s->picture_structure==PICT_FRAME){
3881         h->curr_pic_num=   h->frame_num;
3882         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3883     }else{
3884         h->curr_pic_num= 2*h->frame_num + 1;
3885         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3886     }
3887
3888     if(h->nal_unit_type == NAL_IDR_SLICE){
3889         get_ue_golomb(&s->gb); /* idr_pic_id */
3890     }
3891
3892     if(h->sps.poc_type==0){
3893         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3894
3895         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3896             h->delta_poc_bottom= get_se_golomb(&s->gb);
3897         }
3898     }
3899
3900     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3901         h->delta_poc[0]= get_se_golomb(&s->gb);
3902
3903         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3904             h->delta_poc[1]= get_se_golomb(&s->gb);
3905     }
3906
3907     init_poc(h);
3908
3909     if(h->pps.redundant_pic_cnt_present){
3910         h->redundant_pic_count= get_ue_golomb(&s->gb);
3911     }
3912
3913     //set defaults, might be overridden a few lines later
3914     h->ref_count[0]= h->pps.ref_count[0];
3915     h->ref_count[1]= h->pps.ref_count[1];
3916
3917     if(h->slice_type_nos != FF_I_TYPE){
3918         if(h->slice_type_nos == FF_B_TYPE){
3919             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3920         }
3921         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3922
3923         if(num_ref_idx_active_override_flag){
3924             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3925             if(h->slice_type_nos==FF_B_TYPE)
3926                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3927
3928             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3929                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3930                 h->ref_count[0]= h->ref_count[1]= 1;
3931                 return -1;
3932             }
3933         }
3934         if(h->slice_type_nos == FF_B_TYPE)
3935             h->list_count= 2;
3936         else
3937             h->list_count= 1;
3938     }else
3939         h->list_count= 0;
3940
3941     if(!default_ref_list_done){
3942         fill_default_ref_list(h);
3943     }
3944
3945     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3946         return -1;
3947
3948     if(h->slice_type_nos!=FF_I_TYPE){
3949         s->last_picture_ptr= &h->ref_list[0][0];
3950         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3951     }
3952     if(h->slice_type_nos==FF_B_TYPE){
3953         s->next_picture_ptr= &h->ref_list[1][0];
3954         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3955     }
3956
3957     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3958        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3959         pred_weight_table(h);
3960     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3961         implicit_weight_table(h);
3962     else {
3963         h->use_weight = 0;
3964         for (i = 0; i < 2; i++) {
3965             h->luma_weight_flag[i]   = 0;
3966             h->chroma_weight_flag[i] = 0;
3967         }
3968     }
3969
3970     if(h->nal_ref_idc)
3971         decode_ref_pic_marking(h0, &s->gb);
3972
3973     if(FRAME_MBAFF)
3974         fill_mbaff_ref_list(h);
3975
3976     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3977         direct_dist_scale_factor(h);
3978     direct_ref_list_init(h);
3979
3980     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3981         tmp = get_ue_golomb_31(&s->gb);
3982         if(tmp > 2){
3983             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3984             return -1;
3985         }
3986         h->cabac_init_idc= tmp;
3987     }
3988
3989     h->last_qscale_diff = 0;
3990     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3991     if(tmp>51){
3992         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3993         return -1;
3994     }
3995     s->qscale= tmp;
3996     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3997     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3998     //FIXME qscale / qp ... stuff
3999     if(h->slice_type == FF_SP_TYPE){
4000         get_bits1(&s->gb); /* sp_for_switch_flag */
4001     }
4002     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4003         get_se_golomb(&s->gb); /* slice_qs_delta */
4004     }
4005
4006     h->deblocking_filter = 1;
4007     h->slice_alpha_c0_offset = 0;
4008     h->slice_beta_offset = 0;
4009     if( h->pps.deblocking_filter_parameters_present ) {
4010         tmp= get_ue_golomb_31(&s->gb);
4011         if(tmp > 2){
4012             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4013             return -1;
4014         }
4015         h->deblocking_filter= tmp;
4016         if(h->deblocking_filter < 2)
4017             h->deblocking_filter^= 1; // 1<->0
4018
4019         if( h->deblocking_filter ) {
4020             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4021             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4022         }
4023     }
4024
4025     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4026        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4027        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4028        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4029         h->deblocking_filter= 0;
4030
4031     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4032         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4033             /* Cheat slightly for speed:
4034                Do not bother to deblock across slices. */
4035             h->deblocking_filter = 2;
4036         } else {
4037             h0->max_contexts = 1;
4038             if(!h0->single_decode_warning) {
4039                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4040                 h0->single_decode_warning = 1;
4041             }
4042             if(h != h0)
4043                 return 1; // deblocking switched inside frame
4044         }
4045     }
4046
4047 #if 0 //FMO
4048     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4049         slice_group_change_cycle= get_bits(&s->gb, ?);
4050 #endif
4051
4052     h0->last_slice_type = slice_type;
4053     h->slice_num = ++h0->current_slice;
4054     if(h->slice_num >= MAX_SLICES){
4055         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4056     }
4057
4058     for(j=0; j<2; j++){
4059         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4060         ref2frm[0]=
4061         ref2frm[1]= -1;
4062         for(i=0; i<16; i++)
4063             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4064                           +(h->ref_list[j][i].reference&3);
4065         ref2frm[18+0]=
4066         ref2frm[18+1]= -1;
4067         for(i=16; i<48; i++)
4068             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4069                           +(h->ref_list[j][i].reference&3);
4070     }
4071
4072     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4073     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4074
4075     s->avctx->refs= h->sps.ref_frame_count;
4076
4077     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4078         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4079                h->slice_num,
4080                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4081                first_mb_in_slice,
4082                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4083                pps_id, h->frame_num,
4084                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4085                h->ref_count[0], h->ref_count[1],
4086                s->qscale,
4087                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4088                h->use_weight,
4089                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4090                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4091                );
4092     }
4093
4094     return 0;
4095 }
4096
4097 /**
4098  *
4099  */
4100 static inline int get_level_prefix(GetBitContext *gb){
4101     unsigned int buf;
4102     int log;
4103
4104     OPEN_READER(re, gb);
4105     UPDATE_CACHE(re, gb);
4106     buf=GET_CACHE(re, gb);
4107
4108     log= 32 - av_log2(buf);
4109 #ifdef TRACE
4110     print_bin(buf>>(32-log), log);
4111     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4112 #endif
4113
4114     LAST_SKIP_BITS(re, gb, log);
4115     CLOSE_READER(re, gb);
4116
4117     return log-1;
4118 }
4119
4120 static inline int get_dct8x8_allowed(H264Context *h){
4121     if(h->sps.direct_8x8_inference_flag)
4122         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4123     else
4124         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4125 }
4126
4127 /**
4128  * decodes a residual block.
4129  * @param n block index
4130  * @param scantable scantable
4131  * @param max_coeff number of coefficients in the block
4132  * @return <0 if an error occurred
4133  */
4134 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4135     MpegEncContext * const s = &h->s;
4136     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4137     int level[16];
4138     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4139
4140     //FIXME put trailing_onex into the context
4141
4142     if(n == CHROMA_DC_BLOCK_INDEX){
4143         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4144         total_coeff= coeff_token>>2;
4145     }else{
4146         if(n == LUMA_DC_BLOCK_INDEX){
4147             total_coeff= pred_non_zero_count(h, 0);
4148             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4149             total_coeff= coeff_token>>2;
4150         }else{
4151             total_coeff= pred_non_zero_count(h, n);
4152             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4153             total_coeff= coeff_token>>2;
4154             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4155         }
4156     }
4157
4158     //FIXME set last_non_zero?
4159
4160     if(total_coeff==0)
4161         return 0;
4162     if(total_coeff > (unsigned)max_coeff) {
4163         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4164         return -1;
4165     }
4166
4167     trailing_ones= coeff_token&3;
4168     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4169     assert(total_coeff<=16);
4170
4171     i = show_bits(gb, 3);
4172     skip_bits(gb, trailing_ones);
4173     level[0] = 1-((i&4)>>1);
4174     level[1] = 1-((i&2)   );
4175     level[2] = 1-((i&1)<<1);
4176
4177     if(trailing_ones<total_coeff) {
4178         int mask, prefix;
4179         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4180         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4181         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4182
4183         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4184         if(level_code >= 100){
4185             prefix= level_code - 100;
4186             if(prefix == LEVEL_TAB_BITS)
4187                 prefix += get_level_prefix(gb);
4188
4189             //first coefficient has suffix_length equal to 0 or 1
4190             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4191                 if(suffix_length)
4192                     level_code= (prefix<<1) + get_bits1(gb); //part
4193                 else
4194                     level_code= prefix; //part
4195             }else if(prefix==14){
4196                 if(suffix_length)
4197                     level_code= (prefix<<1) + get_bits1(gb); //part
4198                 else
4199                     level_code= prefix + get_bits(gb, 4); //part
4200             }else{
4201                 level_code= 30 + get_bits(gb, prefix-3); //part
4202                 if(prefix>=16)
4203                     level_code += (1<<(prefix-3))-4096;
4204             }
4205
4206             if(trailing_ones < 3) level_code += 2;
4207
4208             suffix_length = 2;
4209             mask= -(level_code&1);
4210             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4211         }else{
4212             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4213
4214             suffix_length = 1;
4215             if(level_code + 3U > 6U)
4216                 suffix_length++;
4217             level[trailing_ones]= level_code;
4218         }
4219
4220         //remaining coefficients have suffix_length > 0
4221         for(i=trailing_ones+1;i<total_coeff;i++) {
4222             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4223             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4224             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4225
4226             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4227             if(level_code >= 100){
4228                 prefix= level_code - 100;
4229                 if(prefix == LEVEL_TAB_BITS){
4230                     prefix += get_level_prefix(gb);
4231                 }
4232                 if(prefix<15){
4233                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4234                 }else{
4235                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4236                     if(prefix>=16)
4237                         level_code += (1<<(prefix-3))-4096;
4238                 }
4239                 mask= -(level_code&1);
4240                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4241             }
4242             level[i]= level_code;
4243
4244             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4245                 suffix_length++;
4246         }
4247     }
4248
4249     if(total_coeff == max_coeff)
4250         zeros_left=0;
4251     else{
4252         if(n == CHROMA_DC_BLOCK_INDEX)
4253             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4254         else
4255             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4256     }
4257
4258     coeff_num = zeros_left + total_coeff - 1;
4259     j = scantable[coeff_num];
4260     if(n > 24){
4261         block[j] = level[0];
4262         for(i=1;i<total_coeff;i++) {
4263             if(zeros_left <= 0)
4264                 run_before = 0;
4265             else if(zeros_left < 7){
4266                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4267             }else{
4268                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4269             }
4270             zeros_left -= run_before;
4271             coeff_num -= 1 + run_before;
4272             j= scantable[ coeff_num ];
4273
4274             block[j]= level[i];
4275         }
4276     }else{
4277         block[j] = (level[0] * qmul[j] + 32)>>6;
4278         for(i=1;i<total_coeff;i++) {
4279             if(zeros_left <= 0)
4280                 run_before = 0;
4281             else if(zeros_left < 7){
4282                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4283             }else{
4284                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4285             }
4286             zeros_left -= run_before;
4287             coeff_num -= 1 + run_before;
4288             j= scantable[ coeff_num ];
4289
4290             block[j]= (level[i] * qmul[j] + 32)>>6;
4291         }
4292     }
4293
4294     if(zeros_left<0){
4295         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4296         return -1;
4297     }
4298
4299     return 0;
4300 }
4301
4302 static void predict_field_decoding_flag(H264Context *h){
4303     MpegEncContext * const s = &h->s;
4304     const int mb_xy= h->mb_xy;
4305     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4306                 ? s->current_picture.mb_type[mb_xy-1]
4307                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4308                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4309                 : 0;
4310     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4311 }
4312
4313 /**
4314  * decodes a P_SKIP or B_SKIP macroblock
4315  */
4316 static void decode_mb_skip(H264Context *h){
4317     MpegEncContext * const s = &h->s;
4318     const int mb_xy= h->mb_xy;
4319     int mb_type=0;
4320
4321     memset(h->non_zero_count[mb_xy], 0, 16);
4322     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4323
4324     if(MB_FIELD)
4325         mb_type|= MB_TYPE_INTERLACED;
4326
4327     if( h->slice_type_nos == FF_B_TYPE )
4328     {
4329         // just for fill_caches. pred_direct_motion will set the real mb_type
4330         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4331
4332         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4333         pred_direct_motion(h, &mb_type);
4334         mb_type|= MB_TYPE_SKIP;
4335     }
4336     else
4337     {
4338         int mx, my;
4339         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4340
4341         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4342         pred_pskip_motion(h, &mx, &my);
4343         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4344         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4345     }
4346
4347     write_back_motion(h, mb_type);
4348     s->current_picture.mb_type[mb_xy]= mb_type;
4349     s->current_picture.qscale_table[mb_xy]= s->qscale;
4350     h->slice_table[ mb_xy ]= h->slice_num;
4351     h->prev_mb_skipped= 1;
4352 }
4353
4354 /**
4355  * decodes a macroblock
4356  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4357  */
4358 static int decode_mb_cavlc(H264Context *h){
4359     MpegEncContext * const s = &h->s;
4360     int mb_xy;
4361     int partition_count;
4362     unsigned int mb_type, cbp;
4363     int dct8x8_allowed= h->pps.transform_8x8_mode;
4364
4365     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4366
4367     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4368     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4369                 down the code */
4370     if(h->slice_type_nos != FF_I_TYPE){
4371         if(s->mb_skip_run==-1)
4372             s->mb_skip_run= get_ue_golomb(&s->gb);
4373
4374         if (s->mb_skip_run--) {
4375             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4376                 if(s->mb_skip_run==0)
4377                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4378                 else
4379                     predict_field_decoding_flag(h);
4380             }
4381             decode_mb_skip(h);
4382             return 0;
4383         }
4384     }
4385     if(FRAME_MBAFF){
4386         if( (s->mb_y&1) == 0 )
4387             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4388     }
4389
4390     h->prev_mb_skipped= 0;
4391
4392     mb_type= get_ue_golomb(&s->gb);
4393     if(h->slice_type_nos == FF_B_TYPE){
4394         if(mb_type < 23){
4395             partition_count= b_mb_type_info[mb_type].partition_count;
4396             mb_type=         b_mb_type_info[mb_type].type;
4397         }else{
4398             mb_type -= 23;
4399             goto decode_intra_mb;
4400         }
4401     }else if(h->slice_type_nos == FF_P_TYPE){
4402         if(mb_type < 5){
4403             partition_count= p_mb_type_info[mb_type].partition_count;
4404             mb_type=         p_mb_type_info[mb_type].type;
4405         }else{
4406             mb_type -= 5;
4407             goto decode_intra_mb;
4408         }
4409     }else{
4410        assert(h->slice_type_nos == FF_I_TYPE);
4411         if(h->slice_type == FF_SI_TYPE && mb_type)
4412             mb_type--;
4413 decode_intra_mb:
4414         if(mb_type > 25){
4415             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4416             return -1;
4417         }
4418         partition_count=0;
4419         cbp= i_mb_type_info[mb_type].cbp;
4420         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4421         mb_type= i_mb_type_info[mb_type].type;
4422     }
4423
4424     if(MB_FIELD)
4425         mb_type |= MB_TYPE_INTERLACED;
4426
4427     h->slice_table[ mb_xy ]= h->slice_num;
4428
4429     if(IS_INTRA_PCM(mb_type)){
4430         unsigned int x;
4431
4432         // We assume these blocks are very rare so we do not optimize it.
4433         align_get_bits(&s->gb);
4434
4435         // The pixels are stored in the same order as levels in h->mb array.
4436         for(x=0; x < (CHROMA ? 384 : 256); x++){
4437             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4438         }
4439
4440         // In deblocking, the quantizer is 0
4441         s->current_picture.qscale_table[mb_xy]= 0;
4442         // All coeffs are present
4443         memset(h->non_zero_count[mb_xy], 16, 16);
4444
4445         s->current_picture.mb_type[mb_xy]= mb_type;
4446         return 0;
4447     }
4448
4449     if(MB_MBAFF){
4450         h->ref_count[0] <<= 1;
4451         h->ref_count[1] <<= 1;
4452     }
4453
4454     fill_caches(h, mb_type, 0);
4455
4456     //mb_pred
4457     if(IS_INTRA(mb_type)){
4458         int pred_mode;
4459 //            init_top_left_availability(h);
4460         if(IS_INTRA4x4(mb_type)){
4461             int i;
4462             int di = 1;
4463             if(dct8x8_allowed && get_bits1(&s->gb)){
4464                 mb_type |= MB_TYPE_8x8DCT;
4465                 di = 4;
4466             }
4467
4468 //                fill_intra4x4_pred_table(h);
4469             for(i=0; i<16; i+=di){
4470                 int mode= pred_intra_mode(h, i);
4471
4472                 if(!get_bits1(&s->gb)){
4473                     const int rem_mode= get_bits(&s->gb, 3);
4474                     mode = rem_mode + (rem_mode >= mode);
4475                 }
4476
4477                 if(di==4)
4478                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4479                 else
4480                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4481             }
4482             write_back_intra_pred_mode(h);
4483             if( check_intra4x4_pred_mode(h) < 0)
4484                 return -1;
4485         }else{
4486             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4487             if(h->intra16x16_pred_mode < 0)
4488                 return -1;
4489         }
4490         if(CHROMA){
4491             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4492             if(pred_mode < 0)
4493                 return -1;
4494             h->chroma_pred_mode= pred_mode;
4495         }
4496     }else if(partition_count==4){
4497         int i, j, sub_partition_count[4], list, ref[2][4];
4498
4499         if(h->slice_type_nos == FF_B_TYPE){
4500             for(i=0; i<4; i++){
4501                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4502                 if(h->sub_mb_type[i] >=13){
4503                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4504                     return -1;
4505                 }
4506                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4507                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4508             }
4509             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4510                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4511                 pred_direct_motion(h, &mb_type);
4512                 h->ref_cache[0][scan8[4]] =
4513                 h->ref_cache[1][scan8[4]] =
4514                 h->ref_cache[0][scan8[12]] =
4515                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4516             }
4517         }else{
4518             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4519             for(i=0; i<4; i++){
4520                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4521                 if(h->sub_mb_type[i] >=4){
4522                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4523                     return -1;
4524                 }
4525                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4526                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4527             }
4528         }
4529
4530         for(list=0; list<h->list_count; list++){
4531             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4532             for(i=0; i<4; i++){
4533                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4534                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4535                     unsigned int tmp;
4536                     if(ref_count == 1){
4537                         tmp= 0;
4538                     }else if(ref_count == 2){
4539                         tmp= get_bits1(&s->gb)^1;
4540                     }else{
4541                         tmp= get_ue_golomb_31(&s->gb);
4542                         if(tmp>=ref_count){
4543                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4544                             return -1;
4545                         }
4546                     }
4547                     ref[list][i]= tmp;
4548                 }else{
4549                  //FIXME
4550                     ref[list][i] = -1;
4551                 }
4552             }
4553         }
4554
4555         if(dct8x8_allowed)
4556             dct8x8_allowed = get_dct8x8_allowed(h);
4557
4558         for(list=0; list<h->list_count; list++){
4559             for(i=0; i<4; i++){
4560                 if(IS_DIRECT(h->sub_mb_type[i])) {
4561                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4562                     continue;
4563                 }
4564                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4565                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4566
4567                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4568                     const int sub_mb_type= h->sub_mb_type[i];
4569                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4570                     for(j=0; j<sub_partition_count[i]; j++){
4571                         int mx, my;
4572                         const int index= 4*i + block_width*j;
4573                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4574                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4575                         mx += get_se_golomb(&s->gb);
4576                         my += get_se_golomb(&s->gb);
4577                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4578
4579                         if(IS_SUB_8X8(sub_mb_type)){
4580                             mv_cache[ 1 ][0]=
4581                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4582                             mv_cache[ 1 ][1]=
4583                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4584                         }else if(IS_SUB_8X4(sub_mb_type)){
4585                             mv_cache[ 1 ][0]= mx;
4586                             mv_cache[ 1 ][1]= my;
4587                         }else if(IS_SUB_4X8(sub_mb_type)){
4588                             mv_cache[ 8 ][0]= mx;
4589                             mv_cache[ 8 ][1]= my;
4590                         }
4591                         mv_cache[ 0 ][0]= mx;
4592                         mv_cache[ 0 ][1]= my;
4593                     }
4594                 }else{
4595                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4596                     p[0] = p[1]=
4597                     p[8] = p[9]= 0;
4598                 }
4599             }
4600         }
4601     }else if(IS_DIRECT(mb_type)){
4602         pred_direct_motion(h, &mb_type);
4603         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4604     }else{
4605         int list, mx, my, i;
4606          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4607         if(IS_16X16(mb_type)){
4608             for(list=0; list<h->list_count; list++){
4609                     unsigned int val;
4610                     if(IS_DIR(mb_type, 0, list)){
4611                         if(h->ref_count[list]==1){
4612                             val= 0;
4613                         }else if(h->ref_count[list]==2){
4614                             val= get_bits1(&s->gb)^1;
4615                         }else{
4616                             val= get_ue_golomb_31(&s->gb);
4617                             if(val >= h->ref_count[list]){
4618                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4619                                 return -1;
4620                             }
4621                         }
4622                     }else
4623                         val= LIST_NOT_USED&0xFF;
4624                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4625             }
4626             for(list=0; list<h->list_count; list++){
4627                 unsigned int val;
4628                 if(IS_DIR(mb_type, 0, list)){
4629                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4630                     mx += get_se_golomb(&s->gb);
4631                     my += get_se_golomb(&s->gb);
4632                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4633
4634                     val= pack16to32(mx,my);
4635                 }else
4636                     val=0;
4637                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4638             }
4639         }
4640         else if(IS_16X8(mb_type)){
4641             for(list=0; list<h->list_count; list++){
4642                     for(i=0; i<2; i++){
4643                         unsigned int val;
4644                         if(IS_DIR(mb_type, i, list)){
4645                             if(h->ref_count[list] == 1){
4646                                 val= 0;
4647                             }else if(h->ref_count[list] == 2){
4648                                 val= get_bits1(&s->gb)^1;
4649                             }else{
4650                                 val= get_ue_golomb_31(&s->gb);
4651                                 if(val >= h->ref_count[list]){
4652                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4653                                     return -1;
4654                                 }
4655                             }
4656                         }else
4657                             val= LIST_NOT_USED&0xFF;
4658                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4659                     }
4660             }
4661             for(list=0; list<h->list_count; list++){
4662                 for(i=0; i<2; i++){
4663                     unsigned int val;
4664                     if(IS_DIR(mb_type, i, list)){
4665                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4666                         mx += get_se_golomb(&s->gb);
4667                         my += get_se_golomb(&s->gb);
4668                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4669
4670                         val= pack16to32(mx,my);
4671                     }else
4672                         val=0;
4673                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4674                 }
4675             }
4676         }else{
4677             assert(IS_8X16(mb_type));
4678             for(list=0; list<h->list_count; list++){
4679                     for(i=0; i<2; i++){
4680                         unsigned int val;
4681                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4682                             if(h->ref_count[list]==1){
4683                                 val= 0;
4684                             }else if(h->ref_count[list]==2){
4685                                 val= get_bits1(&s->gb)^1;
4686                             }else{
4687                                 val= get_ue_golomb_31(&s->gb);
4688                                 if(val >= h->ref_count[list]){
4689                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4690                                     return -1;
4691                                 }
4692                             }
4693                         }else
4694                             val= LIST_NOT_USED&0xFF;
4695                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4696                     }
4697             }
4698             for(list=0; list<h->list_count; list++){
4699                 for(i=0; i<2; i++){
4700                     unsigned int val;
4701                     if(IS_DIR(mb_type, i, list)){
4702                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4703                         mx += get_se_golomb(&s->gb);
4704                         my += get_se_golomb(&s->gb);
4705                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4706
4707                         val= pack16to32(mx,my);
4708                     }else
4709                         val=0;
4710                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4711                 }
4712             }
4713         }
4714     }
4715
4716     if(IS_INTER(mb_type))
4717         write_back_motion(h, mb_type);
4718
4719     if(!IS_INTRA16x16(mb_type)){
4720         cbp= get_ue_golomb(&s->gb);
4721         if(cbp > 47){
4722             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4723             return -1;
4724         }
4725
4726         if(CHROMA){
4727             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4728             else                     cbp= golomb_to_inter_cbp   [cbp];
4729         }else{
4730             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4731             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4732         }
4733     }
4734     h->cbp = cbp;
4735
4736     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4737         if(get_bits1(&s->gb)){
4738             mb_type |= MB_TYPE_8x8DCT;
4739             h->cbp_table[mb_xy]= cbp;
4740         }
4741     }
4742     s->current_picture.mb_type[mb_xy]= mb_type;
4743
4744     if(cbp || IS_INTRA16x16(mb_type)){
4745         int i8x8, i4x4, chroma_idx;
4746         int dquant;
4747         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4748         const uint8_t *scan, *scan8x8, *dc_scan;
4749
4750 //        fill_non_zero_count_cache(h);
4751
4752         if(IS_INTERLACED(mb_type)){
4753             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4754             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4755             dc_scan= luma_dc_field_scan;
4756         }else{
4757             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4758             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4759             dc_scan= luma_dc_zigzag_scan;
4760         }
4761
4762         dquant= get_se_golomb(&s->gb);
4763
4764         if( dquant > 25 || dquant < -26 ){
4765             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4766             return -1;
4767         }
4768
4769         s->qscale += dquant;
4770         if(((unsigned)s->qscale) > 51){
4771             if(s->qscale<0) s->qscale+= 52;
4772             else            s->qscale-= 52;
4773         }
4774
4775         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4776         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4777         if(IS_INTRA16x16(mb_type)){
4778             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4779                 return -1; //FIXME continue if partitioned and other return -1 too
4780             }
4781
4782             assert((cbp&15) == 0 || (cbp&15) == 15);
4783
4784             if(cbp&15){
4785                 for(i8x8=0; i8x8<4; i8x8++){
4786                     for(i4x4=0; i4x4<4; i4x4++){
4787                         const int index= i4x4 + 4*i8x8;
4788                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4789                             return -1;
4790                         }
4791                     }
4792                 }
4793             }else{
4794                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4795             }
4796         }else{
4797             for(i8x8=0; i8x8<4; i8x8++){
4798                 if(cbp & (1<<i8x8)){
4799                     if(IS_8x8DCT(mb_type)){
4800                         DCTELEM *buf = &h->mb[64*i8x8];
4801                         uint8_t *nnz;
4802                         for(i4x4=0; i4x4<4; i4x4++){
4803                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4804                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4805                                 return -1;
4806                         }
4807                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4808                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4809                     }else{
4810                         for(i4x4=0; i4x4<4; i4x4++){
4811                             const int index= i4x4 + 4*i8x8;
4812
4813                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4814                                 return -1;
4815                             }
4816                         }
4817                     }
4818                 }else{
4819                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4820                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4821                 }
4822             }
4823         }
4824
4825         if(cbp&0x30){
4826             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4827                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4828                     return -1;
4829                 }
4830         }
4831
4832         if(cbp&0x20){
4833             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4834                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4835                 for(i4x4=0; i4x4<4; i4x4++){
4836                     const int index= 16 + 4*chroma_idx + i4x4;
4837                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4838                         return -1;
4839                     }
4840                 }
4841             }
4842         }else{
4843             uint8_t * const nnz= &h->non_zero_count_cache[0];
4844             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4845             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4846         }
4847     }else{
4848         uint8_t * const nnz= &h->non_zero_count_cache[0];
4849         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4850         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4851         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4852     }
4853     s->current_picture.qscale_table[mb_xy]= s->qscale;
4854     write_back_non_zero_count(h);
4855
4856     if(MB_MBAFF){
4857         h->ref_count[0] >>= 1;
4858         h->ref_count[1] >>= 1;
4859     }
4860
4861     return 0;
4862 }
4863
4864 static int decode_cabac_field_decoding_flag(H264Context *h) {
4865     MpegEncContext * const s = &h->s;
4866     const int mb_x = s->mb_x;
4867     const int mb_y = s->mb_y & ~1;
4868     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4869     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4870
4871     unsigned int ctx = 0;
4872
4873     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4874         ctx += 1;
4875     }
4876     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4877         ctx += 1;
4878     }
4879
4880     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4881 }
4882
4883 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4884     uint8_t *state= &h->cabac_state[ctx_base];
4885     int mb_type;
4886
4887     if(intra_slice){
4888         MpegEncContext * const s = &h->s;
4889         const int mba_xy = h->left_mb_xy[0];
4890         const int mbb_xy = h->top_mb_xy;
4891         int ctx=0;
4892         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4893             ctx++;
4894         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4895             ctx++;
4896         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4897             return 0;   /* I4x4 */
4898         state += 2;
4899     }else{
4900         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4901             return 0;   /* I4x4 */
4902     }
4903
4904     if( get_cabac_terminate( &h->cabac ) )
4905         return 25;  /* PCM */
4906
4907     mb_type = 1; /* I16x16 */
4908     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4909     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4910         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4911     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4912     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4913     return mb_type;
4914 }
4915
4916 static int decode_cabac_mb_type_b( H264Context *h ) {
4917     MpegEncContext * const s = &h->s;
4918
4919         const int mba_xy = h->left_mb_xy[0];
4920         const int mbb_xy = h->top_mb_xy;
4921         int ctx = 0;
4922         int bits;
4923         assert(h->slice_type_nos == FF_B_TYPE);
4924
4925         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4926             ctx++;
4927         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4928             ctx++;
4929
4930         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4931             return 0; /* B_Direct_16x16 */
4932
4933         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4934             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4935         }
4936
4937         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4938         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4939         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4940         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4941         if( bits < 8 )
4942             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4943         else if( bits == 13 ) {
4944             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4945         } else if( bits == 14 )
4946             return 11; /* B_L1_L0_8x16 */
4947         else if( bits == 15 )
4948             return 22; /* B_8x8 */
4949
4950         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4951         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4952 }
4953
4954 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4955     MpegEncContext * const s = &h->s;
4956     int mba_xy, mbb_xy;
4957     int ctx = 0;
4958
4959     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4960         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4961         mba_xy = mb_xy - 1;
4962         if( (mb_y&1)
4963             && h->slice_table[mba_xy] == h->slice_num
4964             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4965             mba_xy += s->mb_stride;
4966         if( MB_FIELD ){
4967             mbb_xy = mb_xy - s->mb_stride;
4968             if( !(mb_y&1)
4969                 && h->slice_table[mbb_xy] == h->slice_num
4970                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4971                 mbb_xy -= s->mb_stride;
4972         }else
4973             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4974     }else{
4975         int mb_xy = h->mb_xy;
4976         mba_xy = mb_xy - 1;
4977         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4978     }
4979
4980     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4981         ctx++;
4982     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4983         ctx++;
4984
4985     if( h->slice_type_nos == FF_B_TYPE )
4986         ctx += 13;
4987     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4988 }
4989
4990 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4991     int mode = 0;
4992
4993     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4994         return pred_mode;
4995
4996     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4997     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4998     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4999
5000     if( mode >= pred_mode )
5001         return mode + 1;
5002     else
5003         return mode;
5004 }
5005
5006 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5007     const int mba_xy = h->left_mb_xy[0];
5008     const int mbb_xy = h->top_mb_xy;
5009
5010     int ctx = 0;
5011
5012     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5013     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5014         ctx++;
5015
5016     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5017         ctx++;
5018
5019     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5020         return 0;
5021
5022     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5023         return 1;
5024     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5025         return 2;
5026     else
5027         return 3;
5028 }
5029
5030 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5031     int cbp_b, cbp_a, ctx, cbp = 0;
5032
5033     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5034     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5035
5036     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5037     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5038     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5039     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5040     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5041     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5042     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5043     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5044     return cbp;
5045 }
5046 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5047     int ctx;
5048     int cbp_a, cbp_b;
5049
5050     cbp_a = (h->left_cbp>>4)&0x03;
5051     cbp_b = (h-> top_cbp>>4)&0x03;
5052
5053     ctx = 0;
5054     if( cbp_a > 0 ) ctx++;
5055     if( cbp_b > 0 ) ctx += 2;
5056     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5057         return 0;
5058
5059     ctx = 4;
5060     if( cbp_a == 2 ) ctx++;
5061     if( cbp_b == 2 ) ctx += 2;
5062     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5063 }
5064 static int decode_cabac_mb_dqp( H264Context *h) {
5065     int   ctx= h->last_qscale_diff != 0;
5066     int   val = 0;
5067
5068     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5069         ctx= 2+(ctx>>1);
5070         val++;
5071         if(val > 102) //prevent infinite loop
5072             return INT_MIN;
5073     }
5074
5075     if( val&0x01 )
5076         return   (val + 1)>>1 ;
5077     else
5078         return -((val + 1)>>1);
5079 }
5080 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5081     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5082         return 0;   /* 8x8 */
5083     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5084         return 1;   /* 8x4 */
5085     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5086         return 2;   /* 4x8 */
5087     return 3;       /* 4x4 */
5088 }
5089 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5090     int type;
5091     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5092         return 0;   /* B_Direct_8x8 */
5093     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5094         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5095     type = 3;
5096     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5097         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5098             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5099         type += 4;
5100     }
5101     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5102     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5103     return type;
5104 }
5105
5106 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5107     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5108 }
5109
5110 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5111     int refa = h->ref_cache[list][scan8[n] - 1];
5112     int refb = h->ref_cache[list][scan8[n] - 8];
5113     int ref  = 0;
5114     int ctx  = 0;
5115
5116     if( h->slice_type_nos == FF_B_TYPE) {
5117         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5118             ctx++;
5119         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5120             ctx += 2;
5121     } else {
5122         if( refa > 0 )
5123             ctx++;
5124         if( refb > 0 )
5125             ctx += 2;
5126     }
5127
5128     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5129         ref++;
5130         ctx = (ctx>>2)+4;
5131         if(ref >= 32 /*h->ref_list[list]*/){
5132             return -1;
5133         }
5134     }
5135     return ref;
5136 }
5137
5138 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5139     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5140                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5141     int ctxbase = (l == 0) ? 40 : 47;
5142     int mvd;
5143     int ctx = (amvd>2) + (amvd>32);
5144
5145     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5146         return 0;
5147
5148     mvd= 1;
5149     ctx= 3;
5150     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5151         mvd++;
5152         if( ctx < 6 )
5153             ctx++;
5154     }
5155
5156     if( mvd >= 9 ) {
5157         int k = 3;
5158         while( get_cabac_bypass( &h->cabac ) ) {
5159             mvd += 1 << k;
5160             k++;
5161             if(k>24){
5162                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5163                 return INT_MIN;
5164             }
5165         }
5166         while( k-- ) {
5167             if( get_cabac_bypass( &h->cabac ) )
5168                 mvd += 1 << k;
5169         }
5170     }
5171     return get_cabac_bypass_sign( &h->cabac, -mvd );
5172 }
5173
5174 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5175     int nza, nzb;
5176     int ctx = 0;
5177
5178     if( is_dc ) {
5179         if( cat == 0 ) {
5180             nza = h->left_cbp&0x100;
5181             nzb = h-> top_cbp&0x100;
5182         } else {
5183             nza = (h->left_cbp>>(6+idx))&0x01;
5184             nzb = (h-> top_cbp>>(6+idx))&0x01;
5185         }
5186     } else {
5187         assert(cat == 1 || cat == 2 || cat == 4);
5188         nza = h->non_zero_count_cache[scan8[idx] - 1];
5189         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5190     }
5191
5192     if( nza > 0 )
5193         ctx++;
5194
5195     if( nzb > 0 )
5196         ctx += 2;
5197
5198     return ctx + 4 * cat;
5199 }
5200
5201 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5202     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5203     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5204     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5205     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5206 };
5207
5208 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5209     static const int significant_coeff_flag_offset[2][6] = {
5210       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5211       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5212     };
5213     static const int last_coeff_flag_offset[2][6] = {
5214       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5215       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5216     };
5217     static const int coeff_abs_level_m1_offset[6] = {
5218         227+0, 227+10, 227+20, 227+30, 227+39, 426
5219     };
5220     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5221       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5222         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5223         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5224        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5225       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5226         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5227         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5228         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5229     };
5230     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5231      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5232      * map node ctx => cabac ctx for level=1 */
5233     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5234     /* map node ctx => cabac ctx for level>1 */
5235     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5236     static const uint8_t coeff_abs_level_transition[2][8] = {
5237     /* update node ctx after decoding a level=1 */
5238         { 1, 2, 3, 3, 4, 5, 6, 7 },
5239     /* update node ctx after decoding a level>1 */
5240         { 4, 4, 4, 4, 5, 6, 7, 7 }
5241     };
5242
5243     int index[64];
5244
5245     int av_unused last;
5246     int coeff_count = 0;
5247     int node_ctx = 0;
5248
5249     uint8_t *significant_coeff_ctx_base;
5250     uint8_t *last_coeff_ctx_base;
5251     uint8_t *abs_level_m1_ctx_base;
5252
5253 #if !ARCH_X86
5254 #define CABAC_ON_STACK
5255 #endif
5256 #ifdef CABAC_ON_STACK
5257 #define CC &cc
5258     CABACContext cc;
5259     cc.range     = h->cabac.range;
5260     cc.low       = h->cabac.low;
5261     cc.bytestream= h->cabac.bytestream;
5262 #else
5263 #define CC &h->cabac
5264 #endif
5265
5266
5267     /* cat: 0-> DC 16x16  n = 0
5268      *      1-> AC 16x16  n = luma4x4idx
5269      *      2-> Luma4x4   n = luma4x4idx
5270      *      3-> DC Chroma n = iCbCr
5271      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5272      *      5-> Luma8x8   n = 4 * luma8x8idx
5273      */
5274
5275     /* read coded block flag */
5276     if( is_dc || cat != 5 ) {
5277         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5278             if( !is_dc )
5279                 h->non_zero_count_cache[scan8[n]] = 0;
5280
5281 #ifdef CABAC_ON_STACK
5282             h->cabac.range     = cc.range     ;
5283             h->cabac.low       = cc.low       ;
5284             h->cabac.bytestream= cc.bytestream;
5285 #endif
5286             return;
5287         }
5288     }
5289
5290     significant_coeff_ctx_base = h->cabac_state
5291         + significant_coeff_flag_offset[MB_FIELD][cat];
5292     last_coeff_ctx_base = h->cabac_state
5293         + last_coeff_flag_offset[MB_FIELD][cat];
5294     abs_level_m1_ctx_base = h->cabac_state
5295         + coeff_abs_level_m1_offset[cat];
5296
5297     if( !is_dc && cat == 5 ) {
5298 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5299         for(last= 0; last < coefs; last++) { \
5300             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5301             if( get_cabac( CC, sig_ctx )) { \
5302                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5303                 index[coeff_count++] = last; \
5304                 if( get_cabac( CC, last_ctx ) ) { \
5305                     last= max_coeff; \
5306                     break; \
5307                 } \
5308             } \
5309         }\
5310         if( last == max_coeff -1 ) {\
5311             index[coeff_count++] = last;\
5312         }
5313         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5314 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5315         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5316     } else {
5317         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5318 #else
5319         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5320     } else {
5321         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5322 #endif
5323     }
5324     assert(coeff_count > 0);
5325
5326     if( is_dc ) {
5327         if( cat == 0 )
5328             h->cbp_table[h->mb_xy] |= 0x100;
5329         else
5330             h->cbp_table[h->mb_xy] |= 0x40 << n;
5331     } else {
5332         if( cat == 5 )
5333             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5334         else {
5335             assert( cat == 1 || cat == 2 || cat == 4 );
5336             h->non_zero_count_cache[scan8[n]] = coeff_count;
5337         }
5338     }
5339
5340     do {
5341         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5342
5343         int j= scantable[index[--coeff_count]];
5344
5345         if( get_cabac( CC, ctx ) == 0 ) {
5346             node_ctx = coeff_abs_level_transition[0][node_ctx];
5347             if( is_dc ) {
5348                 block[j] = get_cabac_bypass_sign( CC, -1);
5349             }else{
5350                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5351             }
5352         } else {
5353             int coeff_abs = 2;
5354             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5355             node_ctx = coeff_abs_level_transition[1][node_ctx];
5356
5357             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5358                 coeff_abs++;
5359             }
5360
5361             if( coeff_abs >= 15 ) {
5362                 int j = 0;
5363                 while( get_cabac_bypass( CC ) ) {
5364                     j++;
5365                 }
5366
5367                 coeff_abs=1;
5368                 while( j-- ) {
5369                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5370                 }
5371                 coeff_abs+= 14;
5372             }
5373
5374             if( is_dc ) {
5375                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5376             }else{
5377                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5378             }
5379         }
5380     } while( coeff_count );
5381 #ifdef CABAC_ON_STACK
5382             h->cabac.range     = cc.range     ;
5383             h->cabac.low       = cc.low       ;
5384             h->cabac.bytestream= cc.bytestream;
5385 #endif
5386
5387 }
5388
5389 #if !CONFIG_SMALL
5390 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5391     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5392 }
5393
5394 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5395     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5396 }
5397 #endif
5398
5399 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5400 #if CONFIG_SMALL
5401     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5402 #else
5403     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5404     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5405 #endif
5406 }
5407
5408 static inline void compute_mb_neighbors(H264Context *h)
5409 {
5410     MpegEncContext * const s = &h->s;
5411     const int mb_xy  = h->mb_xy;
5412     h->top_mb_xy     = mb_xy - s->mb_stride;
5413     h->left_mb_xy[0] = mb_xy - 1;
5414     if(FRAME_MBAFF){
5415         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5416         const int top_pair_xy      = pair_xy     - s->mb_stride;
5417         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5418         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5419         const int curr_mb_field_flag = MB_FIELD;
5420         const int bottom = (s->mb_y & 1);
5421
5422         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5423             h->top_mb_xy -= s->mb_stride;
5424         }
5425         if (!left_mb_field_flag == curr_mb_field_flag) {
5426             h->left_mb_xy[0] = pair_xy - 1;
5427         }
5428     } else if (FIELD_PICTURE) {
5429         h->top_mb_xy -= s->mb_stride;
5430     }
5431     return;
5432 }
5433
5434 /**
5435  * decodes a macroblock
5436  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5437  */
5438 static int decode_mb_cabac(H264Context *h) {
5439     MpegEncContext * const s = &h->s;
5440     int mb_xy;
5441     int mb_type, partition_count, cbp = 0;
5442     int dct8x8_allowed= h->pps.transform_8x8_mode;
5443
5444     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5445
5446     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5447     if( h->slice_type_nos != FF_I_TYPE ) {
5448         int skip;
5449         /* a skipped mb needs the aff flag from the following mb */
5450         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5451             predict_field_decoding_flag(h);
5452         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5453             skip = h->next_mb_skipped;
5454         else
5455             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5456         /* read skip flags */
5457         if( skip ) {
5458             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5459                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5460                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5461                 if(!h->next_mb_skipped)
5462                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5463             }
5464
5465             decode_mb_skip(h);
5466
5467             h->cbp_table[mb_xy] = 0;
5468             h->chroma_pred_mode_table[mb_xy] = 0;
5469             h->last_qscale_diff = 0;
5470
5471             return 0;
5472
5473         }
5474     }
5475     if(FRAME_MBAFF){
5476         if( (s->mb_y&1) == 0 )
5477             h->mb_mbaff =
5478             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5479     }
5480
5481     h->prev_mb_skipped = 0;
5482
5483     compute_mb_neighbors(h);
5484
5485     if( h->slice_type_nos == FF_B_TYPE ) {
5486         mb_type = decode_cabac_mb_type_b( h );
5487         if( mb_type < 23 ){
5488             partition_count= b_mb_type_info[mb_type].partition_count;
5489             mb_type=         b_mb_type_info[mb_type].type;
5490         }else{
5491             mb_type -= 23;
5492             goto decode_intra_mb;
5493         }
5494     } else if( h->slice_type_nos == FF_P_TYPE ) {
5495         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5496             /* P-type */
5497             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5498                 /* P_L0_D16x16, P_8x8 */
5499                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5500             } else {
5501                 /* P_L0_D8x16, P_L0_D16x8 */
5502                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5503             }
5504             partition_count= p_mb_type_info[mb_type].partition_count;
5505             mb_type=         p_mb_type_info[mb_type].type;
5506         } else {
5507             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5508             goto decode_intra_mb;
5509         }
5510     } else {
5511         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5512         if(h->slice_type == FF_SI_TYPE && mb_type)
5513             mb_type--;
5514         assert(h->slice_type_nos == FF_I_TYPE);
5515 decode_intra_mb:
5516         partition_count = 0;
5517         cbp= i_mb_type_info[mb_type].cbp;
5518         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5519         mb_type= i_mb_type_info[mb_type].type;
5520     }
5521     if(MB_FIELD)
5522         mb_type |= MB_TYPE_INTERLACED;
5523
5524     h->slice_table[ mb_xy ]= h->slice_num;
5525
5526     if(IS_INTRA_PCM(mb_type)) {
5527         const uint8_t *ptr;
5528
5529         // We assume these blocks are very rare so we do not optimize it.
5530         // FIXME The two following lines get the bitstream position in the cabac
5531         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5532         ptr= h->cabac.bytestream;
5533         if(h->cabac.low&0x1) ptr--;
5534         if(CABAC_BITS==16){
5535             if(h->cabac.low&0x1FF) ptr--;
5536         }
5537
5538         // The pixels are stored in the same order as levels in h->mb array.
5539         memcpy(h->mb, ptr, 256); ptr+=256;
5540         if(CHROMA){
5541             memcpy(h->mb+128, ptr, 128); ptr+=128;
5542         }
5543
5544         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5545
5546         // All blocks are present
5547         h->cbp_table[mb_xy] = 0x1ef;
5548         h->chroma_pred_mode_table[mb_xy] = 0;
5549         // In deblocking, the quantizer is 0
5550         s->current_picture.qscale_table[mb_xy]= 0;
5551         // All coeffs are present
5552         memset(h->non_zero_count[mb_xy], 16, 16);
5553         s->current_picture.mb_type[mb_xy]= mb_type;
5554         h->last_qscale_diff = 0;
5555         return 0;
5556     }
5557
5558     if(MB_MBAFF){
5559         h->ref_count[0] <<= 1;
5560         h->ref_count[1] <<= 1;
5561     }
5562
5563     fill_caches(h, mb_type, 0);
5564
5565     if( IS_INTRA( mb_type ) ) {
5566         int i, pred_mode;
5567         if( IS_INTRA4x4( mb_type ) ) {
5568             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5569                 mb_type |= MB_TYPE_8x8DCT;
5570                 for( i = 0; i < 16; i+=4 ) {
5571                     int pred = pred_intra_mode( h, i );
5572                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5573                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5574                 }
5575             } else {
5576                 for( i = 0; i < 16; i++ ) {
5577                     int pred = pred_intra_mode( h, i );
5578                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5579
5580                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5581                 }
5582             }
5583             write_back_intra_pred_mode(h);
5584             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5585         } else {
5586             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5587             if( h->intra16x16_pred_mode < 0 ) return -1;
5588         }
5589         if(CHROMA){
5590             h->chroma_pred_mode_table[mb_xy] =
5591             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5592
5593             pred_mode= check_intra_pred_mode( h, pred_mode );
5594             if( pred_mode < 0 ) return -1;
5595             h->chroma_pred_mode= pred_mode;
5596         }
5597     } else if( partition_count == 4 ) {
5598         int i, j, sub_partition_count[4], list, ref[2][4];
5599
5600         if( h->slice_type_nos == FF_B_TYPE ) {
5601             for( i = 0; i < 4; i++ ) {
5602                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5603                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5604                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5605             }
5606             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5607                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5608                 pred_direct_motion(h, &mb_type);
5609                 h->ref_cache[0][scan8[4]] =
5610                 h->ref_cache[1][scan8[4]] =
5611                 h->ref_cache[0][scan8[12]] =
5612                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5613                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5614                     for( i = 0; i < 4; i++ )
5615                         if( IS_DIRECT(h->sub_mb_type[i]) )
5616                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5617                 }
5618             }
5619         } else {
5620             for( i = 0; i < 4; i++ ) {
5621                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5622                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5623                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5624             }
5625         }
5626
5627         for( list = 0; list < h->list_count; list++ ) {
5628                 for( i = 0; i < 4; i++ ) {
5629                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5630                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5631                         if( h->ref_count[list] > 1 ){
5632                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5633                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5634                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5635                                 return -1;
5636                             }
5637                         }else
5638                             ref[list][i] = 0;
5639                     } else {
5640                         ref[list][i] = -1;
5641                     }
5642                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5643                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5644                 }
5645         }
5646
5647         if(dct8x8_allowed)
5648             dct8x8_allowed = get_dct8x8_allowed(h);
5649
5650         for(list=0; list<h->list_count; list++){
5651             for(i=0; i<4; i++){
5652                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5653                 if(IS_DIRECT(h->sub_mb_type[i])){
5654                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5655                     continue;
5656                 }
5657
5658                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5659                     const int sub_mb_type= h->sub_mb_type[i];
5660                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5661                     for(j=0; j<sub_partition_count[i]; j++){
5662                         int mpx, mpy;
5663                         int mx, my;
5664                         const int index= 4*i + block_width*j;
5665                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5666                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5667                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5668
5669                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5670                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5671                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5672
5673                         if(IS_SUB_8X8(sub_mb_type)){
5674                             mv_cache[ 1 ][0]=
5675                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5676                             mv_cache[ 1 ][1]=
5677                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5678
5679                             mvd_cache[ 1 ][0]=
5680                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5681                             mvd_cache[ 1 ][1]=
5682                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5683                         }else if(IS_SUB_8X4(sub_mb_type)){
5684                             mv_cache[ 1 ][0]= mx;
5685                             mv_cache[ 1 ][1]= my;
5686
5687                             mvd_cache[ 1 ][0]= mx - mpx;
5688                             mvd_cache[ 1 ][1]= my - mpy;
5689                         }else if(IS_SUB_4X8(sub_mb_type)){
5690                             mv_cache[ 8 ][0]= mx;
5691                             mv_cache[ 8 ][1]= my;
5692
5693                             mvd_cache[ 8 ][0]= mx - mpx;
5694                             mvd_cache[ 8 ][1]= my - mpy;
5695                         }
5696                         mv_cache[ 0 ][0]= mx;
5697                         mv_cache[ 0 ][1]= my;
5698
5699                         mvd_cache[ 0 ][0]= mx - mpx;
5700                         mvd_cache[ 0 ][1]= my - mpy;
5701                     }
5702                 }else{
5703                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5704                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5705                     p[0] = p[1] = p[8] = p[9] = 0;
5706                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5707                 }
5708             }
5709         }
5710     } else if( IS_DIRECT(mb_type) ) {
5711         pred_direct_motion(h, &mb_type);
5712         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5713         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5714         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5715     } else {
5716         int list, mx, my, i, mpx, mpy;
5717         if(IS_16X16(mb_type)){
5718             for(list=0; list<h->list_count; list++){
5719                 if(IS_DIR(mb_type, 0, list)){
5720                     int ref;
5721                     if(h->ref_count[list] > 1){
5722                         ref= decode_cabac_mb_ref(h, list, 0);
5723                         if(ref >= (unsigned)h->ref_count[list]){
5724                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5725                             return -1;
5726                         }
5727                     }else
5728                         ref=0;
5729                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5730                 }else
5731                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5732             }
5733             for(list=0; list<h->list_count; list++){
5734                 if(IS_DIR(mb_type, 0, list)){
5735                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5736
5737                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5738                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5739                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5740
5741                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5742                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5743                 }else
5744                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5745             }
5746         }
5747         else if(IS_16X8(mb_type)){
5748             for(list=0; list<h->list_count; list++){
5749                     for(i=0; i<2; i++){
5750                         if(IS_DIR(mb_type, i, list)){
5751                             int ref;
5752                             if(h->ref_count[list] > 1){
5753                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5754                                 if(ref >= (unsigned)h->ref_count[list]){
5755                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5756                                     return -1;
5757                                 }
5758                             }else
5759                                 ref=0;
5760                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5761                         }else
5762                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5763                     }
5764             }
5765             for(list=0; list<h->list_count; list++){
5766                 for(i=0; i<2; i++){
5767                     if(IS_DIR(mb_type, i, list)){
5768                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5769                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5770                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5771                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5772
5773                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5774                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5775                     }else{
5776                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5777                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5778                     }
5779                 }
5780             }
5781         }else{
5782             assert(IS_8X16(mb_type));
5783             for(list=0; list<h->list_count; list++){
5784                     for(i=0; i<2; i++){
5785                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5786                             int ref;
5787                             if(h->ref_count[list] > 1){
5788                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5789                                 if(ref >= (unsigned)h->ref_count[list]){
5790                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5791                                     return -1;
5792                                 }
5793                             }else
5794                                 ref=0;
5795                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5796                         }else
5797                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5798                     }
5799             }
5800             for(list=0; list<h->list_count; list++){
5801                 for(i=0; i<2; i++){
5802                     if(IS_DIR(mb_type, i, list)){
5803                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5804                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5805                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5806
5807                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5808                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5809                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5810                     }else{
5811                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5812                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5813                     }
5814                 }
5815             }
5816         }
5817     }
5818
5819    if( IS_INTER( mb_type ) ) {
5820         h->chroma_pred_mode_table[mb_xy] = 0;
5821         write_back_motion( h, mb_type );
5822    }
5823
5824     if( !IS_INTRA16x16( mb_type ) ) {
5825         cbp  = decode_cabac_mb_cbp_luma( h );
5826         if(CHROMA)
5827             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5828     }
5829
5830     h->cbp_table[mb_xy] = h->cbp = cbp;
5831
5832     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5833         if( decode_cabac_mb_transform_size( h ) )
5834             mb_type |= MB_TYPE_8x8DCT;
5835     }
5836     s->current_picture.mb_type[mb_xy]= mb_type;
5837
5838     if( cbp || IS_INTRA16x16( mb_type ) ) {
5839         const uint8_t *scan, *scan8x8, *dc_scan;
5840         const uint32_t *qmul;
5841         int dqp;
5842
5843         if(IS_INTERLACED(mb_type)){
5844             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5845             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5846             dc_scan= luma_dc_field_scan;
5847         }else{
5848             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5849             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5850             dc_scan= luma_dc_zigzag_scan;
5851         }
5852
5853         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5854         if( dqp == INT_MIN ){
5855             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5856             return -1;
5857         }
5858         s->qscale += dqp;
5859         if(((unsigned)s->qscale) > 51){
5860             if(s->qscale<0) s->qscale+= 52;
5861             else            s->qscale-= 52;
5862         }
5863         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5864         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5865
5866         if( IS_INTRA16x16( mb_type ) ) {
5867             int i;
5868             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5869             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5870
5871             if( cbp&15 ) {
5872                 qmul = h->dequant4_coeff[0][s->qscale];
5873                 for( i = 0; i < 16; i++ ) {
5874                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5875                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5876                 }
5877             } else {
5878                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5879             }
5880         } else {
5881             int i8x8, i4x4;
5882             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5883                 if( cbp & (1<<i8x8) ) {
5884                     if( IS_8x8DCT(mb_type) ) {
5885                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5886                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5887                     } else {
5888                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5889                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5890                             const int index = 4*i8x8 + i4x4;
5891                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5892 //START_TIMER
5893                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5894 //STOP_TIMER("decode_residual")
5895                         }
5896                     }
5897                 } else {
5898                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5899                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5900                 }
5901             }
5902         }
5903
5904         if( cbp&0x30 ){
5905             int c;
5906             for( c = 0; c < 2; c++ ) {
5907                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5908                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5909             }
5910         }
5911
5912         if( cbp&0x20 ) {
5913             int c, i;
5914             for( c = 0; c < 2; c++ ) {
5915                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5916                 for( i = 0; i < 4; i++ ) {
5917                     const int index = 16 + 4 * c + i;
5918                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5919                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5920                 }
5921             }
5922         } else {
5923             uint8_t * const nnz= &h->non_zero_count_cache[0];
5924             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5925             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5926         }
5927     } else {
5928         uint8_t * const nnz= &h->non_zero_count_cache[0];
5929         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5930         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5931         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5932         h->last_qscale_diff = 0;
5933     }
5934
5935     s->current_picture.qscale_table[mb_xy]= s->qscale;
5936     write_back_non_zero_count(h);
5937
5938     if(MB_MBAFF){
5939         h->ref_count[0] >>= 1;
5940         h->ref_count[1] >>= 1;
5941     }
5942
5943     return 0;
5944 }
5945
5946
5947 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5948     const int index_a = qp + h->slice_alpha_c0_offset;
5949     const int alpha = (alpha_table+52)[index_a];
5950     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5951
5952     if( bS[0] < 4 ) {
5953         int8_t tc[4];
5954         tc[0] = (tc0_table+52)[index_a][bS[0]];
5955         tc[1] = (tc0_table+52)[index_a][bS[1]];
5956         tc[2] = (tc0_table+52)[index_a][bS[2]];
5957         tc[3] = (tc0_table+52)[index_a][bS[3]];
5958         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5959     } else {
5960         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5961     }
5962 }
5963 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5964     const int index_a = qp + h->slice_alpha_c0_offset;
5965     const int alpha = (alpha_table+52)[index_a];
5966     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5967
5968     if( bS[0] < 4 ) {
5969         int8_t tc[4];
5970         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5971         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5972         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5973         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5974         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5975     } else {
5976         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5977     }
5978 }
5979
5980 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5981     int i;
5982     for( i = 0; i < 16; i++, pix += stride) {
5983         int index_a;
5984         int alpha;
5985         int beta;
5986
5987         int qp_index;
5988         int bS_index = (i >> 1);
5989         if (!MB_FIELD) {
5990             bS_index &= ~1;
5991             bS_index |= (i & 1);
5992         }
5993
5994         if( bS[bS_index] == 0 ) {
5995             continue;
5996         }
5997
5998         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5999         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6000         alpha = (alpha_table+52)[index_a];
6001         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6002
6003         if( bS[bS_index] < 4 ) {
6004             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6005             const int p0 = pix[-1];
6006             const int p1 = pix[-2];
6007             const int p2 = pix[-3];
6008             const int q0 = pix[0];
6009             const int q1 = pix[1];
6010             const int q2 = pix[2];
6011
6012             if( FFABS( p0 - q0 ) < alpha &&
6013                 FFABS( p1 - p0 ) < beta &&
6014                 FFABS( q1 - q0 ) < beta ) {
6015                 int tc = tc0;
6016                 int i_delta;
6017
6018                 if( FFABS( p2 - p0 ) < beta ) {
6019                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6020                     tc++;
6021                 }
6022                 if( FFABS( q2 - q0 ) < beta ) {
6023                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6024                     tc++;
6025                 }
6026
6027                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6028                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6029                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6030                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6031             }
6032         }else{
6033             const int p0 = pix[-1];
6034             const int p1 = pix[-2];
6035             const int p2 = pix[-3];
6036
6037             const int q0 = pix[0];
6038             const int q1 = pix[1];
6039             const int q2 = pix[2];
6040
6041             if( FFABS( p0 - q0 ) < alpha &&
6042                 FFABS( p1 - p0 ) < beta &&
6043                 FFABS( q1 - q0 ) < beta ) {
6044
6045                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6046                     if( FFABS( p2 - p0 ) < beta)
6047                     {
6048                         const int p3 = pix[-4];
6049                         /* p0', p1', p2' */
6050                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6051                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6052                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6053                     } else {
6054                         /* p0' */
6055                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6056                     }
6057                     if( FFABS( q2 - q0 ) < beta)
6058                     {
6059                         const int q3 = pix[3];
6060                         /* q0', q1', q2' */
6061                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6062                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6063                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6064                     } else {
6065                         /* q0' */
6066                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6067                     }
6068                 }else{
6069                     /* p0', q0' */
6070                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6071                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6072                 }
6073                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6074             }
6075         }
6076     }
6077 }
6078 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6079     int i;
6080     for( i = 0; i < 8; i++, pix += stride) {
6081         int index_a;
6082         int alpha;
6083         int beta;
6084
6085         int qp_index;
6086         int bS_index = i;
6087
6088         if( bS[bS_index] == 0 ) {
6089             continue;
6090         }
6091
6092         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6093         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6094         alpha = (alpha_table+52)[index_a];
6095         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6096
6097         if( bS[bS_index] < 4 ) {
6098             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6099             const int p0 = pix[-1];
6100             const int p1 = pix[-2];
6101             const int q0 = pix[0];
6102             const int q1 = pix[1];
6103
6104             if( FFABS( p0 - q0 ) < alpha &&
6105                 FFABS( p1 - p0 ) < beta &&
6106                 FFABS( q1 - q0 ) < beta ) {
6107                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6108
6109                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6110                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6111                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6112             }
6113         }else{
6114             const int p0 = pix[-1];
6115             const int p1 = pix[-2];
6116             const int q0 = pix[0];
6117             const int q1 = pix[1];
6118
6119             if( FFABS( p0 - q0 ) < alpha &&
6120                 FFABS( p1 - p0 ) < beta &&
6121                 FFABS( q1 - q0 ) < beta ) {
6122
6123                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6124                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6125                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6126             }
6127         }
6128     }
6129 }
6130
6131 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6132     const int index_a = qp + h->slice_alpha_c0_offset;
6133     const int alpha = (alpha_table+52)[index_a];
6134     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6135
6136     if( bS[0] < 4 ) {
6137         int8_t tc[4];
6138         tc[0] = (tc0_table+52)[index_a][bS[0]];
6139         tc[1] = (tc0_table+52)[index_a][bS[1]];
6140         tc[2] = (tc0_table+52)[index_a][bS[2]];
6141         tc[3] = (tc0_table+52)[index_a][bS[3]];
6142         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6143     } else {
6144         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6145     }
6146 }
6147
6148 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6149     const int index_a = qp + h->slice_alpha_c0_offset;
6150     const int alpha = (alpha_table+52)[index_a];
6151     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6152
6153     if( bS[0] < 4 ) {
6154         int8_t tc[4];
6155         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6156         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6157         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6158         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6159         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6160     } else {
6161         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6162     }
6163 }
6164
6165 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6166     MpegEncContext * const s = &h->s;
6167     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6168     int mb_xy, mb_type;
6169     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6170
6171     mb_xy = h->mb_xy;
6172
6173     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6174         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6175        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6176                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6177         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6178         return;
6179     }
6180     assert(!FRAME_MBAFF);
6181
6182     mb_type = s->current_picture.mb_type[mb_xy];
6183     qp = s->current_picture.qscale_table[mb_xy];
6184     qp0 = s->current_picture.qscale_table[mb_xy-1];
6185     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6186     qpc = get_chroma_qp( h, 0, qp );
6187     qpc0 = get_chroma_qp( h, 0, qp0 );
6188     qpc1 = get_chroma_qp( h, 0, qp1 );
6189     qp0 = (qp + qp0 + 1) >> 1;
6190     qp1 = (qp + qp1 + 1) >> 1;
6191     qpc0 = (qpc + qpc0 + 1) >> 1;
6192     qpc1 = (qpc + qpc1 + 1) >> 1;
6193     qp_thresh = 15 - h->slice_alpha_c0_offset;
6194     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6195        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6196         return;
6197
6198     if( IS_INTRA(mb_type) ) {
6199         int16_t bS4[4] = {4,4,4,4};
6200         int16_t bS3[4] = {3,3,3,3};
6201         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6202         if( IS_8x8DCT(mb_type) ) {
6203             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6204             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6205             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6206             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6207         } else {
6208             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6209             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6210             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6211             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6212             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6213             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6214             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6215             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6216         }
6217         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6218         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6219         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6220         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6221         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6222         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6223         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6224         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6225         return;
6226     } else {
6227         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6228         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6229         int edges;
6230         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6231             edges = 4;
6232             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6233         } else {
6234             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6235                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6236             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6237                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6238                              ? 3 : 0;
6239             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6240             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6241             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6242                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6243         }
6244         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6245             bSv[0][0] = 0x0004000400040004ULL;
6246         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6247             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6248
6249 #define FILTER(hv,dir,edge)\
6250         if(bSv[dir][edge]) {\
6251             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6252             if(!(edge&1)) {\
6253                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6254                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6255             }\
6256         }
6257         if( edges == 1 ) {
6258             FILTER(v,0,0);
6259             FILTER(h,1,0);
6260         } else if( IS_8x8DCT(mb_type) ) {
6261             FILTER(v,0,0);
6262             FILTER(v,0,2);
6263             FILTER(h,1,0);
6264             FILTER(h,1,2);
6265         } else {
6266             FILTER(v,0,0);
6267             FILTER(v,0,1);
6268             FILTER(v,0,2);
6269             FILTER(v,0,3);
6270             FILTER(h,1,0);
6271             FILTER(h,1,1);
6272             FILTER(h,1,2);
6273             FILTER(h,1,3);
6274         }
6275 #undef FILTER
6276     }
6277 }
6278
6279
6280 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6281     MpegEncContext * const s = &h->s;
6282     int edge;
6283     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6284     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6285     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6286     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6287     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6288
6289     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6290                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6291     // how often to recheck mv-based bS when iterating between edges
6292     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6293                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6294     // how often to recheck mv-based bS when iterating along each edge
6295     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6296
6297     if (first_vertical_edge_done) {
6298         start = 1;
6299     }
6300
6301     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6302         start = 1;
6303
6304     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6305         && !IS_INTERLACED(mb_type)
6306         && IS_INTERLACED(mbm_type)
6307         ) {
6308         // This is a special case in the norm where the filtering must
6309         // be done twice (one each of the field) even if we are in a
6310         // frame macroblock.
6311         //
6312         static const int nnz_idx[4] = {4,5,6,3};
6313         unsigned int tmp_linesize   = 2 *   linesize;
6314         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6315         int mbn_xy = mb_xy - 2 * s->mb_stride;
6316         int qp;
6317         int i, j;
6318         int16_t bS[4];
6319
6320         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6321             if( IS_INTRA(mb_type) ||
6322                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6323                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6324             } else {
6325                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6326                 for( i = 0; i < 4; i++ ) {
6327                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6328                         mbn_nnz[nnz_idx[i]] != 0 )
6329                         bS[i] = 2;
6330                     else
6331                         bS[i] = 1;
6332                 }
6333             }
6334             // Do not use s->qscale as luma quantizer because it has not the same
6335             // value in IPCM macroblocks.
6336             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6337             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6338             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6339             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6340             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6341                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6342             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6343                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6344         }
6345
6346         start = 1;
6347     }
6348
6349     /* Calculate bS */
6350     for( edge = start; edge < edges; edge++ ) {
6351         /* mbn_xy: neighbor macroblock */
6352         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6353         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6354         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6355         int16_t bS[4];
6356         int qp;
6357
6358         if( (edge&1) && IS_8x8DCT(mb_type) )
6359             continue;
6360
6361         if( IS_INTRA(mb_type) ||
6362             IS_INTRA(mbn_type) ) {
6363             int value;
6364             if (edge == 0) {
6365                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6366                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6367                 ) {
6368                     value = 4;
6369                 } else {
6370                     value = 3;
6371                 }
6372             } else {
6373                 value = 3;
6374             }
6375             bS[0] = bS[1] = bS[2] = bS[3] = value;
6376         } else {
6377             int i, l;
6378             int mv_done;
6379
6380             if( edge & mask_edge ) {
6381                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6382                 mv_done = 1;
6383             }
6384             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6385                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6386                 mv_done = 1;
6387             }
6388             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6389                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6390                 int bn_idx= b_idx - (dir ? 8:1);
6391                 int v = 0;
6392
6393                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6394                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6395                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6396                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6397                 }
6398
6399                 if(h->slice_type_nos == FF_B_TYPE && v){
6400                     v=0;
6401                     for( l = 0; !v && l < 2; l++ ) {
6402                         int ln= 1-l;
6403                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6404                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6405                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6406                     }
6407                 }
6408
6409                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6410                 mv_done = 1;
6411             }
6412             else
6413                 mv_done = 0;
6414
6415             for( i = 0; i < 4; i++ ) {
6416                 int x = dir == 0 ? edge : i;
6417                 int y = dir == 0 ? i    : edge;
6418                 int b_idx= 8 + 4 + x + 8*y;
6419                 int bn_idx= b_idx - (dir ? 8:1);
6420
6421                 if( h->non_zero_count_cache[b_idx] |
6422                     h->non_zero_count_cache[bn_idx] ) {
6423                     bS[i] = 2;
6424                 }
6425                 else if(!mv_done)
6426                 {
6427                     bS[i] = 0;
6428                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6429                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6430                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6431                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6432                             bS[i] = 1;
6433                             break;
6434                         }
6435                     }
6436
6437                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6438                         bS[i] = 0;
6439                         for( l = 0; l < 2; l++ ) {
6440                             int ln= 1-l;
6441                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6442                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6443                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6444                                 bS[i] = 1;
6445                                 break;
6446                             }
6447                         }
6448                     }
6449                 }
6450             }
6451
6452             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6453                 continue;
6454         }
6455
6456         /* Filter edge */
6457         // Do not use s->qscale as luma quantizer because it has not the same
6458         // value in IPCM macroblocks.
6459         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6460         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6461         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6462         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6463         if( dir == 0 ) {
6464             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6465             if( (edge&1) == 0 ) {
6466                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6467                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6468                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6469                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6470             }
6471         } else {
6472             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6473             if( (edge&1) == 0 ) {
6474                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6475                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6476                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6477                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6478             }
6479         }
6480     }
6481 }
6482
6483 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6484     MpegEncContext * const s = &h->s;
6485     const int mb_xy= mb_x + mb_y*s->mb_stride;
6486     const int mb_type = s->current_picture.mb_type[mb_xy];
6487     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6488     int first_vertical_edge_done = 0;
6489     av_unused int dir;
6490
6491     //for sufficiently low qp, filtering wouldn't do anything
6492     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6493     if(!FRAME_MBAFF){
6494         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6495         int qp = s->current_picture.qscale_table[mb_xy];
6496         if(qp <= qp_thresh
6497            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6498            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6499             return;
6500         }
6501     }
6502
6503     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6504     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6505         int top_type, left_type[2];
6506         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6507         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6508         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6509
6510         if(IS_8x8DCT(top_type)){
6511             h->non_zero_count_cache[4+8*0]=
6512             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6513             h->non_zero_count_cache[6+8*0]=
6514             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6515         }
6516         if(IS_8x8DCT(left_type[0])){
6517             h->non_zero_count_cache[3+8*1]=
6518             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6519         }
6520         if(IS_8x8DCT(left_type[1])){
6521             h->non_zero_count_cache[3+8*3]=
6522             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6523         }
6524
6525         if(IS_8x8DCT(mb_type)){
6526             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6527             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6528
6529             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6530             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6531
6532             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6533             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6534
6535             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6536             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6537         }
6538     }
6539
6540     if (FRAME_MBAFF
6541             // left mb is in picture
6542             && h->slice_table[mb_xy-1] != 0xFFFF
6543             // and current and left pair do not have the same interlaced type
6544             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6545             // and left mb is in the same slice if deblocking_filter == 2
6546             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6547         /* First vertical edge is different in MBAFF frames
6548          * There are 8 different bS to compute and 2 different Qp
6549          */
6550         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6551         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6552         int16_t bS[8];
6553         int qp[2];
6554         int bqp[2];
6555         int rqp[2];
6556         int mb_qp, mbn0_qp, mbn1_qp;
6557         int i;
6558         first_vertical_edge_done = 1;
6559
6560         if( IS_INTRA(mb_type) )
6561             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6562         else {
6563             for( i = 0; i < 8; i++ ) {
6564                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6565
6566                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6567                     bS[i] = 4;
6568                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6569                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6570                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6571                                                                        :
6572                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6573                     bS[i] = 2;
6574                 else
6575                     bS[i] = 1;
6576             }
6577         }
6578
6579         mb_qp = s->current_picture.qscale_table[mb_xy];
6580         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6581         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6582         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6583         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6584                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6585         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6586                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6587         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6588         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6589                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6590         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6591                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6592
6593         /* Filter edge */
6594         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6595         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6596         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6597         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6598         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6599     }
6600
6601 #if CONFIG_SMALL
6602     for( dir = 0; dir < 2; dir++ )
6603         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6604 #else
6605     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6606     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6607 #endif
6608 }
6609
6610 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6611     H264Context *h = *(void**)arg;
6612     MpegEncContext * const s = &h->s;
6613     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6614
6615     s->mb_skip_run= -1;
6616
6617     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6618                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6619
6620     if( h->pps.cabac ) {
6621         int i;
6622
6623         /* realign */
6624         align_get_bits( &s->gb );
6625
6626         /* init cabac */
6627         ff_init_cabac_states( &h->cabac);
6628         ff_init_cabac_decoder( &h->cabac,
6629                                s->gb.buffer + get_bits_count(&s->gb)/8,
6630                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6631         /* calculate pre-state */
6632         for( i= 0; i < 460; i++ ) {
6633             int pre;
6634             if( h->slice_type_nos == FF_I_TYPE )
6635                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6636             else
6637                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6638
6639             if( pre <= 63 )
6640                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6641             else
6642                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6643         }
6644
6645         for(;;){
6646 //START_TIMER
6647             int ret = decode_mb_cabac(h);
6648             int eos;
6649 //STOP_TIMER("decode_mb_cabac")
6650
6651             if(ret>=0) hl_decode_mb(h);
6652
6653             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6654                 s->mb_y++;
6655
6656                 ret = decode_mb_cabac(h);
6657
6658                 if(ret>=0) hl_decode_mb(h);
6659                 s->mb_y--;
6660             }
6661             eos = get_cabac_terminate( &h->cabac );
6662
6663             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6664                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6665                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6666                 return -1;
6667             }
6668
6669             if( ++s->mb_x >= s->mb_width ) {
6670                 s->mb_x = 0;
6671                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6672                 ++s->mb_y;
6673                 if(FIELD_OR_MBAFF_PICTURE) {
6674                     ++s->mb_y;
6675                 }
6676             }
6677
6678             if( eos || s->mb_y >= s->mb_height ) {
6679                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6680                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6681                 return 0;
6682             }
6683         }
6684
6685     } else {
6686         for(;;){
6687             int ret = decode_mb_cavlc(h);
6688
6689             if(ret>=0) hl_decode_mb(h);
6690
6691             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6692                 s->mb_y++;
6693                 ret = decode_mb_cavlc(h);
6694
6695                 if(ret>=0) hl_decode_mb(h);
6696                 s->mb_y--;
6697             }
6698
6699             if(ret<0){
6700                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6701                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6702
6703                 return -1;
6704             }
6705
6706             if(++s->mb_x >= s->mb_width){
6707                 s->mb_x=0;
6708                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6709                 ++s->mb_y;
6710                 if(FIELD_OR_MBAFF_PICTURE) {
6711                     ++s->mb_y;
6712                 }
6713                 if(s->mb_y >= s->mb_height){
6714                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6715
6716                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6717                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6718
6719                         return 0;
6720                     }else{
6721                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6722
6723                         return -1;
6724                     }
6725                 }
6726             }
6727
6728             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6729                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6730                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6731                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6732
6733                     return 0;
6734                 }else{
6735                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6736
6737                     return -1;
6738                 }
6739             }
6740         }
6741     }
6742
6743 #if 0
6744     for(;s->mb_y < s->mb_height; s->mb_y++){
6745         for(;s->mb_x < s->mb_width; s->mb_x++){
6746             int ret= decode_mb(h);
6747
6748             hl_decode_mb(h);
6749
6750             if(ret<0){
6751                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6752                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6753
6754                 return -1;
6755             }
6756
6757             if(++s->mb_x >= s->mb_width){
6758                 s->mb_x=0;
6759                 if(++s->mb_y >= s->mb_height){
6760                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6761                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6762
6763                         return 0;
6764                     }else{
6765                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6766
6767                         return -1;
6768                     }
6769                 }
6770             }
6771
6772             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6773                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6774                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6775
6776                     return 0;
6777                 }else{
6778                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6779
6780                     return -1;
6781                 }
6782             }
6783         }
6784         s->mb_x=0;
6785         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6786     }
6787 #endif
6788     return -1; //not reached
6789 }
6790
6791 static int decode_picture_timing(H264Context *h){
6792     MpegEncContext * const s = &h->s;
6793     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6794         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6795         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6796     }
6797     if(h->sps.pic_struct_present_flag){
6798         unsigned int i, num_clock_ts;
6799         h->sei_pic_struct = get_bits(&s->gb, 4);
6800         h->sei_ct_type    = 0;
6801
6802         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6803             return -1;
6804
6805         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6806
6807         for (i = 0 ; i < num_clock_ts ; i++){
6808             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6809                 unsigned int full_timestamp_flag;
6810                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6811                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6812                 skip_bits(&s->gb, 5);                 /* counting_type */
6813                 full_timestamp_flag = get_bits(&s->gb, 1);
6814                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6815                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6816                 skip_bits(&s->gb, 8);                 /* n_frames */
6817                 if(full_timestamp_flag){
6818                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6819                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6820                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6821                 }else{
6822                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6823                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6824                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6825                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6826                             if(get_bits(&s->gb, 1))   /* hours_flag */
6827                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6828                         }
6829                     }
6830                 }
6831                 if(h->sps.time_offset_length > 0)
6832                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6833             }
6834         }
6835     }
6836     return 0;
6837 }
6838
6839 static int decode_unregistered_user_data(H264Context *h, int size){
6840     MpegEncContext * const s = &h->s;
6841     uint8_t user_data[16+256];
6842     int e, build, i;
6843
6844     if(size<16)
6845         return -1;
6846
6847     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6848         user_data[i]= get_bits(&s->gb, 8);
6849     }
6850
6851     user_data[i]= 0;
6852     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6853     if(e==1 && build>=0)
6854         h->x264_build= build;
6855
6856     if(s->avctx->debug & FF_DEBUG_BUGS)
6857         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6858
6859     for(; i<size; i++)
6860         skip_bits(&s->gb, 8);
6861
6862     return 0;
6863 }
6864
6865 static int decode_recovery_point(H264Context *h){
6866     MpegEncContext * const s = &h->s;
6867
6868     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6869     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6870
6871     return 0;
6872 }
6873
6874 static int decode_buffering_period(H264Context *h){
6875     MpegEncContext * const s = &h->s;
6876     unsigned int sps_id;
6877     int sched_sel_idx;
6878     SPS *sps;
6879
6880     sps_id = get_ue_golomb_31(&s->gb);
6881     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6882         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6883         return -1;
6884     }
6885     sps = h->sps_buffers[sps_id];
6886
6887     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6888     if (sps->nal_hrd_parameters_present_flag) {
6889         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6890             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6891             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6892         }
6893     }
6894     if (sps->vcl_hrd_parameters_present_flag) {
6895         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6896             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6897             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6898         }
6899     }
6900
6901     h->sei_buffering_period_present = 1;
6902     return 0;
6903 }
6904
6905 int ff_h264_decode_sei(H264Context *h){
6906     MpegEncContext * const s = &h->s;
6907
6908     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6909         int size, type;
6910
6911         type=0;
6912         do{
6913             type+= show_bits(&s->gb, 8);
6914         }while(get_bits(&s->gb, 8) == 255);
6915
6916         size=0;
6917         do{
6918             size+= show_bits(&s->gb, 8);
6919         }while(get_bits(&s->gb, 8) == 255);
6920
6921         switch(type){
6922         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6923             if(decode_picture_timing(h) < 0)
6924                 return -1;
6925             break;
6926         case SEI_TYPE_USER_DATA_UNREGISTERED:
6927             if(decode_unregistered_user_data(h, size) < 0)
6928                 return -1;
6929             break;
6930         case SEI_TYPE_RECOVERY_POINT:
6931             if(decode_recovery_point(h) < 0)
6932                 return -1;
6933             break;
6934         case SEI_BUFFERING_PERIOD:
6935             if(decode_buffering_period(h) < 0)
6936                 return -1;
6937             break;
6938         default:
6939             skip_bits(&s->gb, 8*size);
6940         }
6941
6942         //FIXME check bits here
6943         align_get_bits(&s->gb);
6944     }
6945
6946     return 0;
6947 }
6948
6949 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6950     MpegEncContext * const s = &h->s;
6951     int cpb_count, i;
6952     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6953
6954     if(cpb_count > 32U){
6955         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6956         return -1;
6957     }
6958
6959     get_bits(&s->gb, 4); /* bit_rate_scale */
6960     get_bits(&s->gb, 4); /* cpb_size_scale */
6961     for(i=0; i<cpb_count; i++){
6962         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6963         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6964         get_bits1(&s->gb);     /* cbr_flag */
6965     }
6966     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6967     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6968     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6969     sps->time_offset_length = get_bits(&s->gb, 5);
6970     sps->cpb_cnt = cpb_count;
6971     return 0;
6972 }
6973
6974 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6975     MpegEncContext * const s = &h->s;
6976     int aspect_ratio_info_present_flag;
6977     unsigned int aspect_ratio_idc;
6978
6979     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6980
6981     if( aspect_ratio_info_present_flag ) {
6982         aspect_ratio_idc= get_bits(&s->gb, 8);
6983         if( aspect_ratio_idc == EXTENDED_SAR ) {
6984             sps->sar.num= get_bits(&s->gb, 16);
6985             sps->sar.den= get_bits(&s->gb, 16);
6986         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6987             sps->sar=  pixel_aspect[aspect_ratio_idc];
6988         }else{
6989             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6990             return -1;
6991         }
6992     }else{
6993         sps->sar.num=
6994         sps->sar.den= 0;
6995     }
6996 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6997
6998     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6999         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7000     }
7001
7002     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7003         get_bits(&s->gb, 3);    /* video_format */
7004         get_bits1(&s->gb);      /* video_full_range_flag */
7005         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7006             get_bits(&s->gb, 8); /* colour_primaries */
7007             get_bits(&s->gb, 8); /* transfer_characteristics */
7008             get_bits(&s->gb, 8); /* matrix_coefficients */
7009         }
7010     }
7011
7012     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7013         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7014         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7015     }
7016
7017     sps->timing_info_present_flag = get_bits1(&s->gb);
7018     if(sps->timing_info_present_flag){
7019         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7020         sps->time_scale = get_bits_long(&s->gb, 32);
7021         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7022     }
7023
7024     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7025     if(sps->nal_hrd_parameters_present_flag)
7026         if(decode_hrd_parameters(h, sps) < 0)
7027             return -1;
7028     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7029     if(sps->vcl_hrd_parameters_present_flag)
7030         if(decode_hrd_parameters(h, sps) < 0)
7031             return -1;
7032     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7033         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7034     sps->pic_struct_present_flag = get_bits1(&s->gb);
7035
7036     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7037     if(sps->bitstream_restriction_flag){
7038         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7039         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7040         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7041         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7042         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7043         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7044         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7045
7046         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7047             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7048             return -1;
7049         }
7050     }
7051
7052     return 0;
7053 }
7054
7055 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7056                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7057     MpegEncContext * const s = &h->s;
7058     int i, last = 8, next = 8;
7059     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7060     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7061         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7062     else
7063     for(i=0;i<size;i++){
7064         if(next)
7065             next = (last + get_se_golomb(&s->gb)) & 0xff;
7066         if(!i && !next){ /* matrix not written, we use the preset one */
7067             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7068             break;
7069         }
7070         last = factors[scan[i]] = next ? next : last;
7071     }
7072 }
7073
7074 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7075                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7076     MpegEncContext * const s = &h->s;
7077     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7078     const uint8_t *fallback[4] = {
7079         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7080         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7081         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7082         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7083     };
7084     if(get_bits1(&s->gb)){
7085         sps->scaling_matrix_present |= is_sps;
7086         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7087         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7088         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7089         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7090         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7091         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7092         if(is_sps || pps->transform_8x8_mode){
7093             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7094             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7095         }
7096     }
7097 }
7098
7099 int ff_h264_decode_seq_parameter_set(H264Context *h){
7100     MpegEncContext * const s = &h->s;
7101     int profile_idc, level_idc;
7102     unsigned int sps_id;
7103     int i;
7104     SPS *sps;
7105
7106     profile_idc= get_bits(&s->gb, 8);
7107     get_bits1(&s->gb);   //constraint_set0_flag
7108     get_bits1(&s->gb);   //constraint_set1_flag
7109     get_bits1(&s->gb);   //constraint_set2_flag
7110     get_bits1(&s->gb);   //constraint_set3_flag
7111     get_bits(&s->gb, 4); // reserved
7112     level_idc= get_bits(&s->gb, 8);
7113     sps_id= get_ue_golomb_31(&s->gb);
7114
7115     if(sps_id >= MAX_SPS_COUNT) {
7116         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7117         return -1;
7118     }
7119     sps= av_mallocz(sizeof(SPS));
7120     if(sps == NULL)
7121         return -1;
7122
7123     sps->profile_idc= profile_idc;
7124     sps->level_idc= level_idc;
7125
7126     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7127     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7128     sps->scaling_matrix_present = 0;
7129
7130     if(sps->profile_idc >= 100){ //high profile
7131         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7132         if(sps->chroma_format_idc == 3)
7133             sps->residual_color_transform_flag = get_bits1(&s->gb);
7134         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7135         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7136         sps->transform_bypass = get_bits1(&s->gb);
7137         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7138     }else{
7139         sps->chroma_format_idc= 1;
7140     }
7141
7142     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7143     sps->poc_type= get_ue_golomb_31(&s->gb);
7144
7145     if(sps->poc_type == 0){ //FIXME #define
7146         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7147     } else if(sps->poc_type == 1){//FIXME #define
7148         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7149         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7150         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7151         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7152
7153         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7154             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7155             goto fail;
7156         }
7157
7158         for(i=0; i<sps->poc_cycle_length; i++)
7159             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7160     }else if(sps->poc_type != 2){
7161         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7162         goto fail;
7163     }
7164
7165     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7166     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7167         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7168         goto fail;
7169     }
7170     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7171     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7172     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7173     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7174        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7175         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7176         goto fail;
7177     }
7178
7179     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7180     if(!sps->frame_mbs_only_flag)
7181         sps->mb_aff= get_bits1(&s->gb);
7182     else
7183         sps->mb_aff= 0;
7184
7185     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7186
7187 #ifndef ALLOW_INTERLACE
7188     if(sps->mb_aff)
7189         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7190 #endif
7191     sps->crop= get_bits1(&s->gb);
7192     if(sps->crop){
7193         sps->crop_left  = get_ue_golomb(&s->gb);
7194         sps->crop_right = get_ue_golomb(&s->gb);
7195         sps->crop_top   = get_ue_golomb(&s->gb);
7196         sps->crop_bottom= get_ue_golomb(&s->gb);
7197         if(sps->crop_left || sps->crop_top){
7198             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7199         }
7200         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7201             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7202         }
7203     }else{
7204         sps->crop_left  =
7205         sps->crop_right =
7206         sps->crop_top   =
7207         sps->crop_bottom= 0;
7208     }
7209
7210     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7211     if( sps->vui_parameters_present_flag )
7212         decode_vui_parameters(h, sps);
7213
7214     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7215         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7216                sps_id, sps->profile_idc, sps->level_idc,
7217                sps->poc_type,
7218                sps->ref_frame_count,
7219                sps->mb_width, sps->mb_height,
7220                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7221                sps->direct_8x8_inference_flag ? "8B8" : "",
7222                sps->crop_left, sps->crop_right,
7223                sps->crop_top, sps->crop_bottom,
7224                sps->vui_parameters_present_flag ? "VUI" : "",
7225                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7226                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7227                sps->timing_info_present_flag ? sps->time_scale : 0
7228                );
7229     }
7230
7231     av_free(h->sps_buffers[sps_id]);
7232     h->sps_buffers[sps_id]= sps;
7233     h->sps = *sps;
7234     return 0;
7235 fail:
7236     av_free(sps);
7237     return -1;
7238 }
7239
7240 static void
7241 build_qp_table(PPS *pps, int t, int index)
7242 {
7243     int i;
7244     for(i = 0; i < 52; i++)
7245         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7246 }
7247
7248 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7249     MpegEncContext * const s = &h->s;
7250     unsigned int pps_id= get_ue_golomb(&s->gb);
7251     PPS *pps;
7252
7253     if(pps_id >= MAX_PPS_COUNT) {
7254         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7255         return -1;
7256     }
7257
7258     pps= av_mallocz(sizeof(PPS));
7259     if(pps == NULL)
7260         return -1;
7261     pps->sps_id= get_ue_golomb_31(&s->gb);
7262     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7263         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7264         goto fail;
7265     }
7266
7267     pps->cabac= get_bits1(&s->gb);
7268     pps->pic_order_present= get_bits1(&s->gb);
7269     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7270     if(pps->slice_group_count > 1 ){
7271         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7272         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7273         switch(pps->mb_slice_group_map_type){
7274         case 0:
7275 #if 0
7276 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7277 |    run_length[ i ]                                |1  |ue(v)   |
7278 #endif
7279             break;
7280         case 2:
7281 #if 0
7282 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7283 |{                                                  |   |        |
7284 |    top_left_mb[ i ]                               |1  |ue(v)   |
7285 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7286 |   }                                               |   |        |
7287 #endif
7288             break;
7289         case 3:
7290         case 4:
7291         case 5:
7292 #if 0
7293 |   slice_group_change_direction_flag               |1  |u(1)    |
7294 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7295 #endif
7296             break;
7297         case 6:
7298 #if 0
7299 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7300 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7301 |)                                                  |   |        |
7302 |    slice_group_id[ i ]                            |1  |u(v)    |
7303 #endif
7304             break;
7305         }
7306     }
7307     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7308     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7309     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7310         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7311         goto fail;
7312     }
7313
7314     pps->weighted_pred= get_bits1(&s->gb);
7315     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7316     pps->init_qp= get_se_golomb(&s->gb) + 26;
7317     pps->init_qs= get_se_golomb(&s->gb) + 26;
7318     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7319     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7320     pps->constrained_intra_pred= get_bits1(&s->gb);
7321     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7322
7323     pps->transform_8x8_mode= 0;
7324     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7325     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7326     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7327
7328     if(get_bits_count(&s->gb) < bit_length){
7329         pps->transform_8x8_mode= get_bits1(&s->gb);
7330         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7331         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7332     } else {
7333         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7334     }
7335
7336     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7337     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7338     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7339         h->pps.chroma_qp_diff= 1;
7340
7341     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7342         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7343                pps_id, pps->sps_id,
7344                pps->cabac ? "CABAC" : "CAVLC",
7345                pps->slice_group_count,
7346                pps->ref_count[0], pps->ref_count[1],
7347                pps->weighted_pred ? "weighted" : "",
7348                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7349                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7350                pps->constrained_intra_pred ? "CONSTR" : "",
7351                pps->redundant_pic_cnt_present ? "REDU" : "",
7352                pps->transform_8x8_mode ? "8x8DCT" : ""
7353                );
7354     }
7355
7356     av_free(h->pps_buffers[pps_id]);
7357     h->pps_buffers[pps_id]= pps;
7358     return 0;
7359 fail:
7360     av_free(pps);
7361     return -1;
7362 }
7363
7364 /**
7365  * Call decode_slice() for each context.
7366  *
7367  * @param h h264 master context
7368  * @param context_count number of contexts to execute
7369  */
7370 static void execute_decode_slices(H264Context *h, int context_count){
7371     MpegEncContext * const s = &h->s;
7372     AVCodecContext * const avctx= s->avctx;
7373     H264Context *hx;
7374     int i;
7375
7376     if (s->avctx->hwaccel)
7377         return;
7378     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7379         return;
7380     if(context_count == 1) {
7381         decode_slice(avctx, &h);
7382     } else {
7383         for(i = 1; i < context_count; i++) {
7384             hx = h->thread_context[i];
7385             hx->s.error_recognition = avctx->error_recognition;
7386             hx->s.error_count = 0;
7387         }
7388
7389         avctx->execute(avctx, (void *)decode_slice,
7390                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7391
7392         /* pull back stuff from slices to master context */
7393         hx = h->thread_context[context_count - 1];
7394         s->mb_x = hx->s.mb_x;
7395         s->mb_y = hx->s.mb_y;
7396         s->dropable = hx->s.dropable;
7397         s->picture_structure = hx->s.picture_structure;
7398         for(i = 1; i < context_count; i++)
7399             h->s.error_count += h->thread_context[i]->s.error_count;
7400     }
7401 }
7402
7403
7404 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7405     MpegEncContext * const s = &h->s;
7406     AVCodecContext * const avctx= s->avctx;
7407     int buf_index=0;
7408     H264Context *hx; ///< thread context
7409     int context_count = 0;
7410
7411     h->max_contexts = avctx->thread_count;
7412 #if 0
7413     int i;
7414     for(i=0; i<50; i++){
7415         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7416     }
7417 #endif
7418     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7419         h->current_slice = 0;
7420         if (!s->first_field)
7421             s->current_picture_ptr= NULL;
7422         reset_sei(h);
7423     }
7424
7425     for(;;){
7426         int consumed;
7427         int dst_length;
7428         int bit_length;
7429         const uint8_t *ptr;
7430         int i, nalsize = 0;
7431         int err;
7432
7433         if(h->is_avc) {
7434             if(buf_index >= buf_size) break;
7435             nalsize = 0;
7436             for(i = 0; i < h->nal_length_size; i++)
7437                 nalsize = (nalsize << 8) | buf[buf_index++];
7438             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7439                 if(nalsize == 1){
7440                     buf_index++;
7441                     continue;
7442                 }else{
7443                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7444                     break;
7445                 }
7446             }
7447         } else {
7448             // start code prefix search
7449             for(; buf_index + 3 < buf_size; buf_index++){
7450                 // This should always succeed in the first iteration.
7451                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7452                     break;
7453             }
7454
7455             if(buf_index+3 >= buf_size) break;
7456
7457             buf_index+=3;
7458         }
7459
7460         hx = h->thread_context[context_count];
7461
7462         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7463         if (ptr==NULL || dst_length < 0){
7464             return -1;
7465         }
7466         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7467             dst_length--;
7468         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7469
7470         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7471             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7472         }
7473
7474         if (h->is_avc && (nalsize != consumed)){
7475             int i, debug_level = AV_LOG_DEBUG;
7476             for (i = consumed; i < nalsize; i++)
7477                 if (buf[buf_index+i])
7478                     debug_level = AV_LOG_ERROR;
7479             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7480             consumed= nalsize;
7481         }
7482
7483         buf_index += consumed;
7484
7485         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7486            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7487             continue;
7488
7489       again:
7490         err = 0;
7491         switch(hx->nal_unit_type){
7492         case NAL_IDR_SLICE:
7493             if (h->nal_unit_type != NAL_IDR_SLICE) {
7494                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7495                 return -1;
7496             }
7497             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7498         case NAL_SLICE:
7499             init_get_bits(&hx->s.gb, ptr, bit_length);
7500             hx->intra_gb_ptr=
7501             hx->inter_gb_ptr= &hx->s.gb;
7502             hx->s.data_partitioning = 0;
7503
7504             if((err = decode_slice_header(hx, h)))
7505                break;
7506
7507             if (s->avctx->hwaccel && h->current_slice == 1) {
7508                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7509                     return -1;
7510             }
7511
7512             s->current_picture_ptr->key_frame |=
7513                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7514                     (h->sei_recovery_frame_cnt >= 0);
7515             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7516                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7517                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7518                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7519                && avctx->skip_frame < AVDISCARD_ALL){
7520                 if(avctx->hwaccel) {
7521                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7522                         return -1;
7523                 }else
7524                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7525                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7526                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7527                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7528                 }else
7529                     context_count++;
7530             }
7531             break;
7532         case NAL_DPA:
7533             init_get_bits(&hx->s.gb, ptr, bit_length);
7534             hx->intra_gb_ptr=
7535             hx->inter_gb_ptr= NULL;
7536             hx->s.data_partitioning = 1;
7537
7538             err = decode_slice_header(hx, h);
7539             break;
7540         case NAL_DPB:
7541             init_get_bits(&hx->intra_gb, ptr, bit_length);
7542             hx->intra_gb_ptr= &hx->intra_gb;
7543             break;
7544         case NAL_DPC:
7545             init_get_bits(&hx->inter_gb, ptr, bit_length);
7546             hx->inter_gb_ptr= &hx->inter_gb;
7547
7548             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7549                && s->context_initialized
7550                && s->hurry_up < 5
7551                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7552                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7553                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7554                && avctx->skip_frame < AVDISCARD_ALL)
7555                 context_count++;
7556             break;
7557         case NAL_SEI:
7558             init_get_bits(&s->gb, ptr, bit_length);
7559             ff_h264_decode_sei(h);
7560             break;
7561         case NAL_SPS:
7562             init_get_bits(&s->gb, ptr, bit_length);
7563             ff_h264_decode_seq_parameter_set(h);
7564
7565             if(s->flags& CODEC_FLAG_LOW_DELAY)
7566                 s->low_delay=1;
7567
7568             if(avctx->has_b_frames < 2)
7569                 avctx->has_b_frames= !s->low_delay;
7570             break;
7571         case NAL_PPS:
7572             init_get_bits(&s->gb, ptr, bit_length);
7573
7574             ff_h264_decode_picture_parameter_set(h, bit_length);
7575
7576             break;
7577         case NAL_AUD:
7578         case NAL_END_SEQUENCE:
7579         case NAL_END_STREAM:
7580         case NAL_FILLER_DATA:
7581         case NAL_SPS_EXT:
7582         case NAL_AUXILIARY_SLICE:
7583             break;
7584         default:
7585             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7586         }
7587
7588         if(context_count == h->max_contexts) {
7589             execute_decode_slices(h, context_count);
7590             context_count = 0;
7591         }
7592
7593         if (err < 0)
7594             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7595         else if(err == 1) {
7596             /* Slice could not be decoded in parallel mode, copy down
7597              * NAL unit stuff to context 0 and restart. Note that
7598              * rbsp_buffer is not transferred, but since we no longer
7599              * run in parallel mode this should not be an issue. */
7600             h->nal_unit_type = hx->nal_unit_type;
7601             h->nal_ref_idc   = hx->nal_ref_idc;
7602             hx = h;
7603             goto again;
7604         }
7605     }
7606     if(context_count)
7607         execute_decode_slices(h, context_count);
7608     return buf_index;
7609 }
7610
7611 /**
7612  * returns the number of bytes consumed for building the current frame
7613  */
7614 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7615         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7616         if(pos+10>buf_size) pos=buf_size; // oops ;)
7617
7618         return pos;
7619 }
7620
7621 static int decode_frame(AVCodecContext *avctx,
7622                              void *data, int *data_size,
7623                              AVPacket *avpkt)
7624 {
7625     const uint8_t *buf = avpkt->data;
7626     int buf_size = avpkt->size;
7627     H264Context *h = avctx->priv_data;
7628     MpegEncContext *s = &h->s;
7629     AVFrame *pict = data;
7630     int buf_index;
7631
7632     s->flags= avctx->flags;
7633     s->flags2= avctx->flags2;
7634
7635    /* end of stream, output what is still in the buffers */
7636     if (buf_size == 0) {
7637         Picture *out;
7638         int i, out_idx;
7639
7640 //FIXME factorize this with the output code below
7641         out = h->delayed_pic[0];
7642         out_idx = 0;
7643         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7644             if(h->delayed_pic[i]->poc < out->poc){
7645                 out = h->delayed_pic[i];
7646                 out_idx = i;
7647             }
7648
7649         for(i=out_idx; h->delayed_pic[i]; i++)
7650             h->delayed_pic[i] = h->delayed_pic[i+1];
7651
7652         if(out){
7653             *data_size = sizeof(AVFrame);
7654             *pict= *(AVFrame*)out;
7655         }
7656
7657         return 0;
7658     }
7659
7660     if(h->is_avc && !h->got_avcC) {
7661         int i, cnt, nalsize;
7662         unsigned char *p = avctx->extradata;
7663         if(avctx->extradata_size < 7) {
7664             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7665             return -1;
7666         }
7667         if(*p != 1) {
7668             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7669             return -1;
7670         }
7671         /* sps and pps in the avcC always have length coded with 2 bytes,
7672            so put a fake nal_length_size = 2 while parsing them */
7673         h->nal_length_size = 2;
7674         // Decode sps from avcC
7675         cnt = *(p+5) & 0x1f; // Number of sps
7676         p += 6;
7677         for (i = 0; i < cnt; i++) {
7678             nalsize = AV_RB16(p) + 2;
7679             if(decode_nal_units(h, p, nalsize) < 0) {
7680                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7681                 return -1;
7682             }
7683             p += nalsize;
7684         }
7685         // Decode pps from avcC
7686         cnt = *(p++); // Number of pps
7687         for (i = 0; i < cnt; i++) {
7688             nalsize = AV_RB16(p) + 2;
7689             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7690                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7691                 return -1;
7692             }
7693             p += nalsize;
7694         }
7695         // Now store right nal length size, that will be use to parse all other nals
7696         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7697         // Do not reparse avcC
7698         h->got_avcC = 1;
7699     }
7700
7701     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7702         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7703             return -1;
7704         h->got_avcC = 1;
7705     }
7706
7707     buf_index=decode_nal_units(h, buf, buf_size);
7708     if(buf_index < 0)
7709         return -1;
7710
7711     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7712         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7713         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7714         return -1;
7715     }
7716
7717     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7718         Picture *out = s->current_picture_ptr;
7719         Picture *cur = s->current_picture_ptr;
7720         int i, pics, cross_idr, out_of_order, out_idx;
7721
7722         s->mb_y= 0;
7723
7724         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7725         s->current_picture_ptr->pict_type= s->pict_type;
7726
7727         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7728             ff_vdpau_h264_set_reference_frames(s);
7729
7730         if(!s->dropable) {
7731             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7732             h->prev_poc_msb= h->poc_msb;
7733             h->prev_poc_lsb= h->poc_lsb;
7734         }
7735         h->prev_frame_num_offset= h->frame_num_offset;
7736         h->prev_frame_num= h->frame_num;
7737
7738         if (avctx->hwaccel) {
7739             if (avctx->hwaccel->end_frame(avctx) < 0)
7740                 av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
7741         }
7742
7743         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7744             ff_vdpau_h264_picture_complete(s);
7745
7746         /*
7747          * FIXME: Error handling code does not seem to support interlaced
7748          * when slices span multiple rows
7749          * The ff_er_add_slice calls don't work right for bottom
7750          * fields; they cause massive erroneous error concealing
7751          * Error marking covers both fields (top and bottom).
7752          * This causes a mismatched s->error_count
7753          * and a bad error table. Further, the error count goes to
7754          * INT_MAX when called for bottom field, because mb_y is
7755          * past end by one (callers fault) and resync_mb_y != 0
7756          * causes problems for the first MB line, too.
7757          */
7758         if (!FIELD_PICTURE)
7759             ff_er_frame_end(s);
7760
7761         MPV_frame_end(s);
7762
7763         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7764             /* Wait for second field. */
7765             *data_size = 0;
7766
7767         } else {
7768             cur->repeat_pict = 0;
7769
7770             /* Signal interlacing information externally. */
7771             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7772             if (h->sei_ct_type)
7773                 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7774             else
7775                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7776
7777             if(h->sps.pic_struct_present_flag){
7778                 switch (h->sei_pic_struct)
7779                 {
7780                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7781                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7782                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7783                     // From these hints, let the applications decide if they apply deinterlacing.
7784                     cur->repeat_pict = 1;
7785                     break;
7786                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7787                     // Force progressive here, as doubling interlaced frame is a bad idea.
7788                     cur->interlaced_frame = 0;
7789                     cur->repeat_pict = 2;
7790                     break;
7791                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7792                     cur->interlaced_frame = 0;
7793                     cur->repeat_pict = 4;
7794                     break;
7795                 }
7796             }else{
7797                 /* Derive interlacing flag from used decoding process. */
7798                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7799             }
7800
7801             if (cur->field_poc[0] != cur->field_poc[1]){
7802                 /* Derive top_field_first from field pocs. */
7803                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7804             }else{
7805                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7806                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7807                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7808                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7809                         cur->top_field_first = 1;
7810                     else
7811                         cur->top_field_first = 0;
7812                 }else{
7813                     /* Most likely progressive */
7814                     cur->top_field_first = 0;
7815                 }
7816             }
7817
7818         //FIXME do something with unavailable reference frames
7819
7820             /* Sort B-frames into display order */
7821
7822             if(h->sps.bitstream_restriction_flag
7823                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7824                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7825                 s->low_delay = 0;
7826             }
7827
7828             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7829                && !h->sps.bitstream_restriction_flag){
7830                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7831                 s->low_delay= 0;
7832             }
7833
7834             pics = 0;
7835             while(h->delayed_pic[pics]) pics++;
7836
7837             assert(pics <= MAX_DELAYED_PIC_COUNT);
7838
7839             h->delayed_pic[pics++] = cur;
7840             if(cur->reference == 0)
7841                 cur->reference = DELAYED_PIC_REF;
7842
7843             out = h->delayed_pic[0];
7844             out_idx = 0;
7845             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7846                 if(h->delayed_pic[i]->poc < out->poc){
7847                     out = h->delayed_pic[i];
7848                     out_idx = i;
7849                 }
7850             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7851
7852             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7853
7854             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7855                 { }
7856             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7857                || (s->low_delay &&
7858                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7859                  || cur->pict_type == FF_B_TYPE)))
7860             {
7861                 s->low_delay = 0;
7862                 s->avctx->has_b_frames++;
7863             }
7864
7865             if(out_of_order || pics > s->avctx->has_b_frames){
7866                 out->reference &= ~DELAYED_PIC_REF;
7867                 for(i=out_idx; h->delayed_pic[i]; i++)
7868                     h->delayed_pic[i] = h->delayed_pic[i+1];
7869             }
7870             if(!out_of_order && pics > s->avctx->has_b_frames){
7871                 *data_size = sizeof(AVFrame);
7872
7873                 h->outputed_poc = out->poc;
7874                 *pict= *(AVFrame*)out;
7875             }else{
7876                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7877             }
7878         }
7879     }
7880
7881     assert(pict->data[0] || !*data_size);
7882     ff_print_debug_info(s, pict);
7883 //printf("out %d\n", (int)pict->data[0]);
7884 #if 0 //?
7885
7886     /* Return the Picture timestamp as the frame number */
7887     /* we subtract 1 because it is added on utils.c     */
7888     avctx->frame_number = s->picture_number - 1;
7889 #endif
7890     return get_consumed_bytes(s, buf_index, buf_size);
7891 }
7892 #if 0
7893 static inline void fill_mb_avail(H264Context *h){ /* Fills h->mb_avail[0..5] for the macroblock at (s->mb_x, s->mb_y); currently compiled out by the enclosing #if 0. */
7894     MpegEncContext * const s = &h->s;
7895     const int cur_xy  = s->mb_y*s->mb_stride + s->mb_x; /* slice_table index of the current macroblock */
7896     const int above_xy= cur_xy - s->mb_stride;          /* index one macroblock row up; only dereferenced when mb_y != 0 */
7897     if(!s->mb_y){ /* first row: nothing above, so entries 0..2 are cleared */
7898         h->mb_avail[2]= 0;
7899         h->mb_avail[1]= 0;
7900         h->mb_avail[0]= 0;
7901     }else{ /* above-left, above, above-right: set when the slice_table entry equals h->slice_num */
7902         h->mb_avail[0]= s->mb_x != 0              && h->slice_table[above_xy - 1] == h->slice_num;
7903         h->mb_avail[1]=                              h->slice_table[above_xy    ] == h->slice_num;
7904         h->mb_avail[2]= s->mb_x + 1 < s->mb_width && h->slice_table[above_xy + 1] == h->slice_num;
7905     }
7906     h->mb_avail[3]= s->mb_x != 0 && h->slice_table[cur_xy - 1] == h->slice_num; /* left neighbour, guarded so column 0 never reads cur_xy - 1 */
7907     h->mb_avail[4]= 1; //FIXME move out
7908     h->mb_avail[5]= 0; //FIXME move out
7909 }
7910 #endif
7911
7912 #ifdef TEST
7913 #undef printf
7914 #undef random
7915 #define COUNT 8000
7916 #define SIZE (COUNT*40)
7917 int main(void){ /* Self-test, built only when TEST is defined: Exp-Golomb write/read round trips. Mismatches are only printed; the live code always returns 0. */
7918     int i;
7919     uint8_t temp[SIZE];
7920     PutBitContext pb;
7921     GetBitContext gb;
7922 //    int int_temp[10000];
7923     DSPContext dsp;
7924     AVCodecContext avctx;
7925     /* NOTE(review): avctx is passed to dsputil_init() without being initialized here -- confirm that is safe. */
7926     dsputil_init(&dsp, &avctx);
7927     /* Round trip 1: write 0..COUNT-1 with set_ue_golomb(), then read them back with get_ue_golomb(). */
7928     init_put_bits(&pb, temp, SIZE);
7929     printf("testing unsigned exp golomb\n");
7930     for(i=0; i<COUNT; i++){
7931         START_TIMER
7932         set_ue_golomb(&pb, i);
7933         STOP_TIMER("set_ue_golomb");
7934     }
7935     flush_put_bits(&pb);
7936
7937     init_get_bits(&gb, temp, 8*SIZE);
7938     for(i=0; i<COUNT; i++){
7939         int j, s;
7940         /* s holds show_bits(&gb, 24) taken before decoding; it is only used in the mismatch message. */
7941         s= show_bits(&gb, 24);
7942
7943         START_TIMER
7944         j= get_ue_golomb(&gb);
7945         if(j != i){
7946             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7947 //            return -1;
7948         }
7949         STOP_TIMER("get_ue_golomb");
7950     }
7951
7952     /* Round trip 2: signed codes; the value written at step i is i - COUNT/2. */
7953     init_put_bits(&pb, temp, SIZE);
7954     printf("testing signed exp golomb\n");
7955     for(i=0; i<COUNT; i++){
7956         START_TIMER
7957         set_se_golomb(&pb, i - COUNT/2);
7958         STOP_TIMER("set_se_golomb");
7959     }
7960     flush_put_bits(&pb);
7961
7962     init_get_bits(&gb, temp, 8*SIZE);
7963     for(i=0; i<COUNT; i++){
7964         int j, s;
7965
7966         s= show_bits(&gb, 24);
7967
7968         START_TIMER
7969         j= get_se_golomb(&gb);
7970         if(j != i - COUNT/2){
7971             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s); /* expected value is the one that was written, not the loop index */
7972 //            return -1;
7973         }
7974         STOP_TIMER("get_se_golomb");
7975     }
7976     /* The 4x4 (I)DCT, quantizer and NAL tests below are compiled out by #if 0. */
7977 #if 0
7978     printf("testing 4x4 (I)DCT\n");
7979
7980     DCTELEM block[16];
7981     uint8_t src[16], ref[16];
7982     uint64_t error= 0, max_error=0;
7983
7984     for(i=0; i<COUNT; i++){
7985         int j;
7986 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7987         for(j=0; j<16; j++){
7988             ref[j]= random()%255;
7989             src[j]= random()%255;
7990         }
7991
7992         h264_diff_dct_c(block, src, ref, 4);
7993
7994         //normalize
7995         for(j=0; j<16; j++){
7996 //            printf("%d ", block[j]);
7997             block[j]= block[j]*4;
7998             if(j&1) block[j]= (block[j]*4 + 2)/5;
7999             if(j&4) block[j]= (block[j]*4 + 2)/5;
8000         }
8001 //        printf("\n");
8002
8003         s->dsp.h264_idct_add(ref, block, 4);
8004 /*        for(j=0; j<16; j++){
8005             printf("%d ", ref[j]);
8006         }
8007         printf("\n");*/
8008
8009         for(j=0; j<16; j++){
8010             int diff= FFABS(src[j] - ref[j]);
8011
8012             error+= diff*diff;
8013             max_error= FFMAX(max_error, diff);
8014         }
8015     }
8016     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8017     printf("testing quantizer\n");
8018     for(qp=0; qp<52; qp++){
8019         for(i=0; i<16; i++)
8020             src1_block[i]= src2_block[i]= random()%255;
8021
8022     }
8023     printf("Testing NAL layer\n");
8024
8025     uint8_t bitstream[COUNT];
8026     uint8_t nal[COUNT*2];
8027     H264Context h;
8028     memset(&h, 0, sizeof(H264Context));
8029
8030     for(i=0; i<COUNT; i++){
8031         int zeros= i;
8032         int nal_length;
8033         int consumed;
8034         int out_length;
8035         uint8_t *out;
8036         int j;
8037
8038         for(j=0; j<COUNT; j++){
8039             bitstream[j]= (random() % 255) + 1;
8040         }
8041
8042         for(j=0; j<zeros; j++){
8043             int pos= random() % COUNT;
8044             while(bitstream[pos] == 0){
8045                 pos++;
8046                 pos %= COUNT;
8047             }
8048             bitstream[pos]=0;
8049         }
8050
8051         START_TIMER
8052
8053         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8054         if(nal_length<0){
8055             printf("encoding failed\n");
8056             return -1;
8057         }
8058
8059         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8060
8061         STOP_TIMER("NAL")
8062
8063         if(out_length != COUNT){
8064             printf("incorrect length %d %d\n", out_length, COUNT);
8065             return -1;
8066         }
8067
8068         if(consumed != nal_length){
8069             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8070             return -1;
8071         }
8072
8073         if(memcmp(bitstream, out, COUNT)){
8074             printf("mismatch\n");
8075             return -1;
8076         }
8077     }
8078 #endif
8079     /* Only the banner is printed; no RBSP check follows it. */
8080     printf("Testing RBSP\n");
8081
8082
8083     return 0;
8084 }
8085 #endif /* TEST */
8086
8087
8088 av_cold void ff_h264_free_context(H264Context *h)
8089 {   /* Passes the RBSP buffers and every SPS/PPS buffer slot of h to av_freep(), and h itself to free_tables(). */
8090     int n;
8091     /* The two RBSP buffers go first, then the tables, in the same order as before. */
8092     for(n = 0; n < 2; n++)
8093         av_freep(&h->rbsp_buffer[n]);
8094     free_tables(h); //FIXME cleanup init stuff perhaps
8095     /* One av_freep() call per slot; av_freep() receives the address of the slot. */
8096     for(n = 0; n != MAX_SPS_COUNT; n++)
8097         av_freep(&h->sps_buffers[n]);
8098
8099     for(n = 0; n != MAX_PPS_COUNT; n++)
8100         av_freep(&h->pps_buffers[n]);
8101 }
8102
8103 static av_cold int decode_end(AVCodecContext *avctx)
8104 {   /* Teardown routine listed in the AVCodec tables of this file; operates on avctx->priv_data and always returns 0. */
8105     H264Context * const ctx = avctx->priv_data;
8106
8107     /* H.264-specific buffers first ... */
8108     ff_h264_free_context(ctx);
8109     /* ... then the MpegEncContext embedded in the H.264 context. */
8110     MPV_common_end(&ctx->s);
8111
8112 //    memset(ctx, 0, sizeof(H264Context));
8113
8114     return 0;
8115 }
8116
8117
8118 AVCodec h264_decoder = { /* AVCodec descriptor named "h264". The first nine entries are positional, so their order must follow the AVCodec declaration. */
8119     "h264",
8120     CODEC_TYPE_VIDEO,
8121     CODEC_ID_H264,
8122     sizeof(H264Context), /* presumably the size of the private context allocated per codec instance -- TODO confirm against AVCodec */
8123     decode_init,
8124     NULL, /* NOTE(review): looks like the encode slot, unused for a decoder -- confirm against AVCodec */
8125     decode_end,
8126     decode_frame,
8127     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8128     .flush= flush_dpb,
8129     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8130     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8131 };
8132
8133 #if CONFIG_H264_VDPAU_DECODER
8134 AVCodec h264_vdpau_decoder = { /* AVCodec descriptor named "h264_vdpau"; only built when CONFIG_H264_VDPAU_DECODER is set. */
8135     "h264_vdpau",
8136     CODEC_TYPE_VIDEO,
8137     CODEC_ID_H264,
8138     sizeof(H264Context),
8139     decode_init, /* same decode_init/decode_end/decode_frame entry points as h264_decoder */
8140     NULL,
8141     decode_end,
8142     decode_frame,
8143     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU, /* differs from h264_decoder by the added CODEC_CAP_HWACCEL_VDPAU flag */
8144     .flush= flush_dpb,
8145     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8146 }; /* NOTE(review): unlike h264_decoder, no .pix_fmts is set here -- confirm that is intended */
8147 #endif
8148
8149 #if CONFIG_SVQ3_DECODER
8150 #include "svq3.c"
8151 #endif