libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  64 static const int total_zeros_vlc_tables_size = 512;
  65
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2];
  72 static const int run_vlc_tables_size = 8;
  73
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values where changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 997
 998     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 999         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1000             int cur_poc = s->current_picture_ptr->poc;
1001             int *col_poc = h->ref_list[1]->field_poc;
1002             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1003             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1004             b8_stride = 0;
1005         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1006             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1007             mb_xy += s->mb_stride*fieldoff;
1008         }
1009         goto single_col;
1010     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1011         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1012             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1013             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1014             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1015             b8_stride *= 3;
1016             b4_stride *= 6;
1017             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1018             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1019                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1020                 && !is_b8x8){
1021                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1022                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1023             }else{
1024                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1025                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1026             }
1027         }else{                                           //     AFR/FR    -> AFR/FR
1028 single_col:
1029             mb_type_col[0] =
1030             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1031             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1032                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1033                 * so we know exactly what block size to use */
1034                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1035                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1036             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1037                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1038                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1039             }else{
1040                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }
1043         }
1044     }
1045
1046     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1047     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1048     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1049     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1050     if(!b8_stride){
1051         if(s->mb_y&1){
1052             l1ref0 += h->b8_stride;
1053             l1ref1 += h->b8_stride;
1054             l1mv0  +=  2*b4_stride;
1055             l1mv1  +=  2*b4_stride;
1056         }
1057     }
1058
1059     if(h->direct_spatial_mv_pred){
1060         int ref[2];
1061         int mv[2][2];
1062         int list;
1063
1064         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1065
1066         /* ref = min(neighbors) */
1067         for(list=0; list<2; list++){
1068             int refa = h->ref_cache[list][scan8[0] - 1];
1069             int refb = h->ref_cache[list][scan8[0] - 8];
1070             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1071             if(refc == PART_NOT_AVAILABLE)
1072                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1073             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1074             if(ref[list] < 0)
1075                 ref[list] = -1;
1076         }
1077
1078         if(ref[0] < 0 && ref[1] < 0){
1079             ref[0] = ref[1] = 0;
1080             mv[0][0] = mv[0][1] =
1081             mv[1][0] = mv[1][1] = 0;
1082         }else{
1083             for(list=0; list<2; list++){
1084                 if(ref[list] >= 0)
1085                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1086                 else
1087                     mv[list][0] = mv[list][1] = 0;
1088             }
1089         }
1090
1091         if(ref[1] < 0){
1092             if(!is_b8x8)
1093                 *mb_type &= ~MB_TYPE_L1;
1094             sub_mb_type &= ~MB_TYPE_L1;
1095         }else if(ref[0] < 0){
1096             if(!is_b8x8)
1097                 *mb_type &= ~MB_TYPE_L0;
1098             sub_mb_type &= ~MB_TYPE_L0;
1099         }
1100
1101         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1102             for(i8=0; i8<4; i8++){
1103                 int x8 = i8&1;
1104                 int y8 = i8>>1;
1105                 int xy8 = x8+y8*b8_stride;
1106                 int xy4 = 3*x8+y8*b4_stride;
1107                 int a=0, b=0;
1108
1109                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1110                     continue;
1111                 h->sub_mb_type[i8] = sub_mb_type;
1112
1113                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1114                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1115                 if(!IS_INTRA(mb_type_col[y8])
1116                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1117                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1118                     if(ref[0] > 0)
1119                         a= pack16to32(mv[0][0],mv[0][1]);
1120                     if(ref[1] > 0)
1121                         b= pack16to32(mv[1][0],mv[1][1]);
1122                 }else{
1123                     a= pack16to32(mv[0][0],mv[0][1]);
1124                     b= pack16to32(mv[1][0],mv[1][1]);
1125                 }
1126                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1127                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1128             }
1129         }else if(IS_16X16(*mb_type)){
1130             int a=0, b=0;
1131
1132             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1133             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1134             if(!IS_INTRA(mb_type_col[0])
1135                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1136                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1137                        && (h->x264_build>33 || !h->x264_build)))){
1138                 if(ref[0] > 0)
1139                     a= pack16to32(mv[0][0],mv[0][1]);
1140                 if(ref[1] > 0)
1141                     b= pack16to32(mv[1][0],mv[1][1]);
1142             }else{
1143                 a= pack16to32(mv[0][0],mv[0][1]);
1144                 b= pack16to32(mv[1][0],mv[1][1]);
1145             }
1146             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1147             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1148         }else{
1149             for(i8=0; i8<4; i8++){
1150                 const int x8 = i8&1;
1151                 const int y8 = i8>>1;
1152
1153                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1154                     continue;
1155                 h->sub_mb_type[i8] = sub_mb_type;
1156
1157                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1158                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1159                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1160                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1161
1162                 /* col_zero_flag */
1163                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1164                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1165                                                   && (h->x264_build>33 || !h->x264_build)))){
1166                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1167                     if(IS_SUB_8X8(sub_mb_type)){
1168                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1169                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1170                             if(ref[0] == 0)
1171                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1172                             if(ref[1] == 0)
1173                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                         }
1175                     }else
1176                     for(i4=0; i4<4; i4++){
1177                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1178                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1179                             if(ref[0] == 0)
1180                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1181                             if(ref[1] == 0)
1182                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1183                         }
1184                     }
1185                 }
1186             }
1187         }
1188     }else{ /* direct temporal mv pred */
1189         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1190         const int *dist_scale_factor = h->dist_scale_factor;
1191         int ref_offset= 0;
1192
1193         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1194             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1195             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1196             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1197         }
1198         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1199             ref_offset += 16;
1200
1201         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1202             /* FIXME assumes direct_8x8_inference == 1 */
1203             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1204
1205             for(i8=0; i8<4; i8++){
1206                 const int x8 = i8&1;
1207                 const int y8 = i8>>1;
1208                 int ref0, scale;
1209                 const int16_t (*l1mv)[2]= l1mv0;
1210
1211                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1212                     continue;
1213                 h->sub_mb_type[i8] = sub_mb_type;
1214
1215                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1216                 if(IS_INTRA(mb_type_col[y8])){
1217                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1220                     continue;
1221                 }
1222
1223                 ref0 = l1ref0[x8 + y8*b8_stride];
1224                 if(ref0 >= 0)
1225                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1226                 else{
1227                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1228                     l1mv= l1mv1;
1229                 }
1230                 scale = dist_scale_factor[ref0];
1231                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1232
1233                 {
1234                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1235                     int my_col = (mv_col[1]<<y_shift)/2;
1236                     int mx = (scale * mv_col[0] + 128) >> 8;
1237                     int my = (scale * my_col + 128) >> 8;
1238                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1239                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1240                 }
1241             }
1242             return;
1243         }
1244
1245         /* one-to-one mv scaling */
1246
1247         if(IS_16X16(*mb_type)){
1248             int ref, mv0, mv1;
1249
1250             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1251             if(IS_INTRA(mb_type_col[0])){
1252                 ref=mv0=mv1=0;
1253             }else{
1254                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1255                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1256                 const int scale = dist_scale_factor[ref0];
1257                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1258                 int mv_l0[2];
1259                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1260                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1261                 ref= ref0;
1262                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1263                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1264             }
1265             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1266             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1267             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1268         }else{
1269             for(i8=0; i8<4; i8++){
1270                 const int x8 = i8&1;
1271                 const int y8 = i8>>1;
1272                 int ref0, scale;
1273                 const int16_t (*l1mv)[2]= l1mv0;
1274
1275                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1276                     continue;
1277                 h->sub_mb_type[i8] = sub_mb_type;
1278                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1279                 if(IS_INTRA(mb_type_col[0])){
1280                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1283                     continue;
1284                 }
1285
1286                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1287                 if(ref0 >= 0)
1288                     ref0 = map_col_to_list0[0][ref0];
1289                 else{
1290                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1291                     l1mv= l1mv1;
1292                 }
1293                 scale = dist_scale_factor[ref0];
1294
1295                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1296                 if(IS_SUB_8X8(sub_mb_type)){
1297                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1298                     int mx = (scale * mv_col[0] + 128) >> 8;
1299                     int my = (scale * mv_col[1] + 128) >> 8;
1300                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1301                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1302                 }else
1303                 for(i4=0; i4<4; i4++){
1304                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1305                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1306                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1307                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1308                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1309                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1310                 }
1311             }
1312         }
1313     }
1314 }
1315
1316 static inline void write_back_motion(H264Context *h, int mb_type){
1317     MpegEncContext * const s = &h->s;
1318     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1319     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1320     int list;
1321
1322     if(!USES_LIST(mb_type, 0))
1323         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1324
1325     for(list=0; list<h->list_count; list++){
1326         int y;
1327         if(!USES_LIST(mb_type, list))
1328             continue;
1329
1330         for(y=0; y<4; y++){
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1332             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1333         }
1334         if( h->pps.cabac ) {
1335             if(IS_SKIP(mb_type))
1336                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1337             else
1338             for(y=0; y<4; y++){
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1340                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1341             }
1342         }
1343
1344         {
1345             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1346             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1347             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1348             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1349             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1350         }
1351     }
1352
1353     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1354         if(IS_8X8(mb_type)){
1355             uint8_t *direct_table = &h->direct_table[b8_xy];
1356             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1357             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1358             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1359         }
1360     }
1361 }
1362
1363 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1364     int i, si, di;
1365     uint8_t *dst;
1366     int bufidx;
1367
1368 //    src[0]&0x80;                //forbidden bit
1369     h->nal_ref_idc= src[0]>>5;
1370     h->nal_unit_type= src[0]&0x1F;
1371
1372     src++; length--;
1373 #if 0
1374     for(i=0; i<length; i++)
1375         printf("%2X ", src[i]);
1376 #endif
1377
1378 #if HAVE_FAST_UNALIGNED
1379 # if HAVE_FAST_64BIT
1380 #   define RS 7
1381     for(i=0; i+1<length; i+=9){
1382         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1383 # else
1384 #   define RS 3
1385     for(i=0; i+1<length; i+=5){
1386         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1387 # endif
1388             continue;
1389         if(i>0 && !src[i]) i--;
1390         while(src[i]) i++;
1391 #else
1392 #   define RS 0
1393     for(i=0; i+1<length; i+=2){
1394         if(src[i]) continue;
1395         if(i>0 && src[i-1]==0) i--;
1396 #endif
1397         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1398             if(src[i+2]!=3){
1399                 /* startcode, so we must be past the end */
1400                 length=i;
1401             }
1402             break;
1403         }
1404         i-= RS;
1405     }
1406
1407     if(i>=length-1){ //no escaped 0
1408         *dst_length= length;
1409         *consumed= length+1; //+1 for the header
1410         return src;
1411     }
1412
1413     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1414     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1415     dst= h->rbsp_buffer[bufidx];
1416
1417     if (dst == NULL){
1418         return NULL;
1419     }
1420
1421 //printf("decoding esc\n");
1422     memcpy(dst, src, i);
1423     si=di=i;
1424     while(si+2<length){
1425         //remove escapes (very rare 1:2^22)
1426         if(src[si+2]>3){
1427             dst[di++]= src[si++];
1428             dst[di++]= src[si++];
1429         }else if(src[si]==0 && src[si+1]==0){
1430             if(src[si+2]==3){ //escape
1431                 dst[di++]= 0;
1432                 dst[di++]= 0;
1433                 si+=3;
1434                 continue;
1435             }else //next start code
1436                 goto nsc;
1437         }
1438
1439         dst[di++]= src[si++];
1440     }
1441     while(si<length)
1442         dst[di++]= src[si++];
1443 nsc:
1444
1445     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1446
1447     *dst_length= di;
1448     *consumed= si + 1;//+1 for the header
1449 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1450     return dst;
1451 }
1452
1453 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1454     int v= *src;
1455     int r;
1456
1457     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1458
1459     for(r=1; r<9; r++){
1460         if(v&1) return r;
1461         v>>=1;
1462     }
1463     return 0;
1464 }
1465
1466 /**
1467  * IDCT transforms the 16 dc values and dequantizes them.
1468  * @param qp quantization parameter
1469  */
1470 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1471 #define stride 16
1472     int i;
1473     int temp[16]; //FIXME check if this is a good idea
1474     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1475     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1476
1477 //memset(block, 64, 2*256);
1478 //return;
1479     for(i=0; i<4; i++){
1480         const int offset= y_offset[i];
1481         const int z0= block[offset+stride*0] + block[offset+stride*4];
1482         const int z1= block[offset+stride*0] - block[offset+stride*4];
1483         const int z2= block[offset+stride*1] - block[offset+stride*5];
1484         const int z3= block[offset+stride*1] + block[offset+stride*5];
1485
1486         temp[4*i+0]= z0+z3;
1487         temp[4*i+1]= z1+z2;
1488         temp[4*i+2]= z1-z2;
1489         temp[4*i+3]= z0-z3;
1490     }
1491
1492     for(i=0; i<4; i++){
1493         const int offset= x_offset[i];
1494         const int z0= temp[4*0+i] + temp[4*2+i];
1495         const int z1= temp[4*0+i] - temp[4*2+i];
1496         const int z2= temp[4*1+i] - temp[4*3+i];
1497         const int z3= temp[4*1+i] + temp[4*3+i];
1498
1499         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1500         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1501         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1502         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1503     }
1504 }
1505
#if 0
/**
 * DCT transforms the 16 dc values.
 *
 * Compiled out. The body uses the `stride` macro that is still defined at
 * this point in the file.
 *
 * @param qp quantization parameter ??? FIXME (the parameter itself is
 *           commented out in the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int out_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int in_offset[4] ={0, 2*stride, 8* stride, 10*stride};
    int mid[16]; //FIXME check if this is a good idea
    int k;

    for(k=0; k<4; k++){
        const DCTELEM *in= block + in_offset[k];
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        mid[4*k+0]= sum04 + sum15;
        mid[4*k+1]= dif04 + dif15;
        mid[4*k+2]= dif04 - dif15;
        mid[4*k+3]= sum04 - sum15;
    }

    for(k=0; k<4; k++){
        DCTELEM *out= block + out_offset[k];
        const int sum02= mid[4*0+k] + mid[4*2+k];
        const int dif02= mid[4*0+k] - mid[4*2+k];
        const int dif13= mid[4*1+k] - mid[4*3+k];
        const int sum13= mid[4*1+k] + mid[4*3+k];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1545
1546 #undef xStride
1547 #undef stride
1548
1549 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1550     const int stride= 16*2;
1551     const int xStride= 16;
1552     int a,b,c,d,e;
1553
1554     a= block[stride*0 + xStride*0];
1555     b= block[stride*0 + xStride*1];
1556     c= block[stride*1 + xStride*0];
1557     d= block[stride*1 + xStride*1];
1558
1559     e= a-b;
1560     a= a+b;
1561     b= c-d;
1562     c= c+d;
1563
1564     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1565     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1566     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1567     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1568 }
1569
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling.
 * Compiled out.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top   = block;
    DCTELEM * const bottom= block + row_step;

    const int top_sum    = top[0]    + top[col_step];
    const int top_diff   = top[0]    - top[col_step];
    const int bottom_sum = bottom[0] + bottom[col_step];
    const int bottom_diff= bottom[0] - bottom[col_step];

    top[0]          = (top_sum  + bottom_sum );
    top[col_step]   = (top_diff + bottom_diff);
    bottom[0]       = (top_sum  - bottom_sum );
    bottom[col_step]= (top_diff - bottom_diff);
}
#endif
1592
1593 /**
1594  * gets the chroma qp.
1595  */
1596 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1597     return h->pps.chroma_qp_table[t][qscale];
1598 }
1599
1600 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1601                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1602                            int src_x_offset, int src_y_offset,
1603                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1604     MpegEncContext * const s = &h->s;
1605     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1606     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1607     const int luma_xy= (mx&3) + ((my&3)<<2);
1608     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1609     uint8_t * src_cb, * src_cr;
1610     int extra_width= h->emu_edge_width;
1611     int extra_height= h->emu_edge_height;
1612     int emu=0;
1613     const int full_mx= mx>>2;
1614     const int full_my= my>>2;
1615     const int pic_width  = 16*s->mb_width;
1616     const int pic_height = 16*s->mb_height >> MB_FIELD;
1617
1618     if(mx&7) extra_width -= 3;
1619     if(my&7) extra_height -= 3;
1620
1621     if(   full_mx < 0-extra_width
1622        || full_my < 0-extra_height
1623        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1624        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1625         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1626             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1627         emu=1;
1628     }
1629
1630     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1631     if(!square){
1632         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1633     }
1634
1635     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1636
1637     if(MB_FIELD){
1638         // chroma offset when predicting from a field of opposite parity
1639         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1640         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1641     }
1642     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1643     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1644
1645     if(emu){
1646         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1647             src_cb= s->edge_emu_buffer;
1648     }
1649     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1650
1651     if(emu){
1652         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1653             src_cr= s->edge_emu_buffer;
1654     }
1655     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1656 }
1657
1658 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1659                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1660                            int x_offset, int y_offset,
1661                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1662                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1663                            int list0, int list1){
1664     MpegEncContext * const s = &h->s;
1665     qpel_mc_func *qpix_op=  qpix_put;
1666     h264_chroma_mc_func chroma_op= chroma_put;
1667
1668     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1669     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1670     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1671     x_offset += 8*s->mb_x;
1672     y_offset += 8*(s->mb_y >> MB_FIELD);
1673
1674     if(list0){
1675         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679
1680         qpix_op=  qpix_avg;
1681         chroma_op= chroma_avg;
1682     }
1683
1684     if(list1){
1685         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689     }
1690 }
1691
1692 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1693                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1694                            int x_offset, int y_offset,
1695                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1696                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1697                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1698                            int list0, int list1){
1699     MpegEncContext * const s = &h->s;
1700
1701     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1702     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1703     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1704     x_offset += 8*s->mb_x;
1705     y_offset += 8*(s->mb_y >> MB_FIELD);
1706
1707     if(list0 && list1){
1708         /* don't optimize for luma-only case, since B-frames usually
1709          * use implicit weights => chroma too. */
1710         uint8_t *tmp_cb = s->obmc_scratchpad;
1711         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1712         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1713         int refn0 = h->ref_cache[0][ scan8[n] ];
1714         int refn1 = h->ref_cache[1][ scan8[n] ];
1715
1716         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1717                     dest_y, dest_cb, dest_cr,
1718                     x_offset, y_offset, qpix_put, chroma_put);
1719         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1720                     tmp_y, tmp_cb, tmp_cr,
1721                     x_offset, y_offset, qpix_put, chroma_put);
1722
1723         if(h->use_weight == 2){
1724             int weight0 = h->implicit_weight[refn0][refn1];
1725             int weight1 = 64 - weight0;
1726             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1727             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1729         }else{
1730             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1731                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1732                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1733             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1734                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1735                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1736             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1737                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1738                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1739         }
1740     }else{
1741         int list = list1 ? 1 : 0;
1742         int refn = h->ref_cache[list][ scan8[n] ];
1743         Picture *ref= &h->ref_list[list][refn];
1744         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1745                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1746                     qpix_put, chroma_put);
1747
1748         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1749                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1750         if(h->use_weight_chroma){
1751             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1752                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1753             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1755         }
1756     }
1757 }
1758
1759 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1760                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1761                            int x_offset, int y_offset,
1762                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1763                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1764                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1765                            int list0, int list1){
1766     if((h->use_weight==2 && list0 && list1
1767         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1768        || h->use_weight==1)
1769         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1770                          x_offset, y_offset, qpix_put, chroma_put,
1771                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1772     else
1773         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1774                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1775 }
1776
1777 static inline void prefetch_motion(H264Context *h, int list){
1778     /* fetch pixels for estimated mv 4 macroblocks ahead
1779      * optimized for 64byte cache lines */
1780     MpegEncContext * const s = &h->s;
1781     const int refn = h->ref_cache[list][scan8[0]];
1782     if(refn >= 0){
1783         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1784         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1785         uint8_t **src= h->ref_list[list][refn].data;
1786         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1787         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1788         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1789         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1790     }
1791 }
1792
1793 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1794                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1795                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1796                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1797     MpegEncContext * const s = &h->s;
1798     const int mb_xy= h->mb_xy;
1799     const int mb_type= s->current_picture.mb_type[mb_xy];
1800
1801     assert(IS_INTER(mb_type));
1802
1803     prefetch_motion(h, 0);
1804
1805     if(IS_16X16(mb_type)){
1806         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1807                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1808                 &weight_op[0], &weight_avg[0],
1809                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1810     }else if(IS_16X8(mb_type)){
1811         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1812                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1813                 &weight_op[1], &weight_avg[1],
1814                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1815         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1816                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1817                 &weight_op[1], &weight_avg[1],
1818                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1819     }else if(IS_8X16(mb_type)){
1820         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1821                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1822                 &weight_op[2], &weight_avg[2],
1823                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1824         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1825                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1826                 &weight_op[2], &weight_avg[2],
1827                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1828     }else{
1829         int i;
1830
1831         assert(IS_8X8(mb_type));
1832
1833         for(i=0; i<4; i++){
1834             const int sub_mb_type= h->sub_mb_type[i];
1835             const int n= 4*i;
1836             int x_offset= (i&1)<<2;
1837             int y_offset= (i&2)<<1;
1838
1839             if(IS_SUB_8X8(sub_mb_type)){
1840                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1841                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1842                     &weight_op[3], &weight_avg[3],
1843                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1844             }else if(IS_SUB_8X4(sub_mb_type)){
1845                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1846                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1847                     &weight_op[4], &weight_avg[4],
1848                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1849                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1850                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1851                     &weight_op[4], &weight_avg[4],
1852                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1853             }else if(IS_SUB_4X8(sub_mb_type)){
1854                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1855                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1856                     &weight_op[5], &weight_avg[5],
1857                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1858                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1859                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1860                     &weight_op[5], &weight_avg[5],
1861                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1862             }else{
1863                 int j;
1864                 assert(IS_SUB_4X4(sub_mb_type));
1865                 for(j=0; j<4; j++){
1866                     int sub_x_offset= x_offset + 2*(j&1);
1867                     int sub_y_offset= y_offset +   (j&2);
1868                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1869                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1870                         &weight_op[6], &weight_avg[6],
1871                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1872                 }
1873             }
1874         }
1875     }
1876
1877     prefetch_motion(h, 1);
1878 }
1879
1880 static av_cold void init_cavlc_level_tab(void){
1881     int suffix_length, mask;
1882     unsigned int i;
1883
1884     for(suffix_length=0; suffix_length<7; suffix_length++){
1885         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1886             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1887             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1888
1889             mask= -(level_code&1);
1890             level_code= (((2+level_code)>>1) ^ mask) - mask;
1891             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1892                 cavlc_level_tab[suffix_length][i][0]= level_code;
1893                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1894             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1895                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1896                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1897             }else{
1898                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1899                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1900             }
1901         }
1902     }
1903 }
1904
1905 static av_cold void decode_init_vlc(void){
1906     static int done = 0;
1907
1908     if (!done) {
1909         int i;
1910         int offset;
1911         done = 1;
1912
1913         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1914         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1915         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1916                  &chroma_dc_coeff_token_len [0], 1, 1,
1917                  &chroma_dc_coeff_token_bits[0], 1, 1,
1918                  INIT_VLC_USE_NEW_STATIC);
1919
1920         offset = 0;
1921         for(i=0; i<4; i++){
1922             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1923             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1924             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1925                      &coeff_token_len [i][0], 1, 1,
1926                      &coeff_token_bits[i][0], 1, 1,
1927                      INIT_VLC_USE_NEW_STATIC);
1928             offset += coeff_token_vlc_tables_size[i];
1929         }
1930         /*
1931          * This is a one time safety check to make sure that
1932          * the packed static coeff_token_vlc table sizes
1933          * were initialized correctly.
1934          */
1935         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1936
1937         for(i=0; i<3; i++){
1938             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1939             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1940             init_vlc(&chroma_dc_total_zeros_vlc[i],
1941                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1942                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1943                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1944                      INIT_VLC_USE_NEW_STATIC);
1945         }
1946         for(i=0; i<15; i++){
1947             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1948             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1949             init_vlc(&total_zeros_vlc[i],
1950                      TOTAL_ZEROS_VLC_BITS, 16,
1951                      &total_zeros_len [i][0], 1, 1,
1952                      &total_zeros_bits[i][0], 1, 1,
1953                      INIT_VLC_USE_NEW_STATIC);
1954         }
1955
1956         for(i=0; i<6; i++){
1957             run_vlc[i].table = run_vlc_tables[i];
1958             run_vlc[i].table_allocated = run_vlc_tables_size;
1959             init_vlc(&run_vlc[i],
1960                      RUN_VLC_BITS, 7,
1961                      &run_len [i][0], 1, 1,
1962                      &run_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965         run7_vlc.table = run7_vlc_table,
1966         run7_vlc.table_allocated = run7_vlc_table_size;
1967         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1968                  &run_len [6][0], 1, 1,
1969                  &run_bits[6][0], 1, 1,
1970                  INIT_VLC_USE_NEW_STATIC);
1971
1972         init_cavlc_level_tab();
1973     }
1974 }
1975
1976 static void free_tables(H264Context *h){
1977     int i;
1978     H264Context *hx;
1979     av_freep(&h->intra4x4_pred_mode);
1980     av_freep(&h->chroma_pred_mode_table);
1981     av_freep(&h->cbp_table);
1982     av_freep(&h->mvd_table[0]);
1983     av_freep(&h->mvd_table[1]);
1984     av_freep(&h->direct_table);
1985     av_freep(&h->non_zero_count);
1986     av_freep(&h->slice_table_base);
1987     h->slice_table= NULL;
1988
1989     av_freep(&h->mb2b_xy);
1990     av_freep(&h->mb2b8_xy);
1991
1992     for(i = 0; i < h->s.avctx->thread_count; i++) {
1993         hx = h->thread_context[i];
1994         if(!hx) continue;
1995         av_freep(&hx->top_borders[1]);
1996         av_freep(&hx->top_borders[0]);
1997         av_freep(&hx->s.obmc_scratchpad);
1998     }
1999 }
2000
2001 static void init_dequant8_coeff_table(H264Context *h){
2002     int i,q,x;
2003     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2004     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2005     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2006
2007     for(i=0; i<2; i++ ){
2008         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2009             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2010             break;
2011         }
2012
2013         for(q=0; q<52; q++){
2014             int shift = div6[q];
2015             int idx = rem6[q];
2016             for(x=0; x<64; x++)
2017                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2018                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2019                     h->pps.scaling_matrix8[i][x]) << shift;
2020         }
2021     }
2022 }
2023
2024 static void init_dequant4_coeff_table(H264Context *h){
2025     int i,j,q,x;
2026     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2027     for(i=0; i<6; i++ ){
2028         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2029         for(j=0; j<i; j++){
2030             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2031                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2032                 break;
2033             }
2034         }
2035         if(j<i)
2036             continue;
2037
2038         for(q=0; q<52; q++){
2039             int shift = div6[q] + 2;
2040             int idx = rem6[q];
2041             for(x=0; x<16; x++)
2042                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2043                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2044                     h->pps.scaling_matrix4[i][x]) << shift;
2045         }
2046     }
2047 }
2048
2049 static void init_dequant_tables(H264Context *h){
2050     int i,x;
2051     init_dequant4_coeff_table(h);
2052     if(h->pps.transform_8x8_mode)
2053         init_dequant8_coeff_table(h);
2054     if(h->sps.transform_bypass){
2055         for(i=0; i<6; i++)
2056             for(x=0; x<16; x++)
2057                 h->dequant4_coeff[i][0][x] = 1<<6;
2058         if(h->pps.transform_8x8_mode)
2059             for(i=0; i<2; i++)
2060                 for(x=0; x<64; x++)
2061                     h->dequant8_coeff[i][0][x] = 1<<6;
2062     }
2063 }
2064
2065
2066 /**
2067  * allocates tables.
2068  * needs width/height
2069  */
2070 static int alloc_tables(H264Context *h){
2071     MpegEncContext * const s = &h->s;
2072     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2073     int x,y;
2074
2075     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2076
2077     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2078     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2079     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2080
2081     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2082     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2083     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2084     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2085
2086     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2087     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2088
2089     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2090     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2091     for(y=0; y<s->mb_height; y++){
2092         for(x=0; x<s->mb_width; x++){
2093             const int mb_xy= x + y*s->mb_stride;
2094             const int b_xy = 4*x + 4*y*h->b_stride;
2095             const int b8_xy= 2*x + 2*y*h->b8_stride;
2096
2097             h->mb2b_xy [mb_xy]= b_xy;
2098             h->mb2b8_xy[mb_xy]= b8_xy;
2099         }
2100     }
2101
2102     s->obmc_scratchpad = NULL;
2103
2104     if(!h->dequant4_coeff[0])
2105         init_dequant_tables(h);
2106
2107     return 0;
2108 fail:
2109     free_tables(h);
2110     return -1;
2111 }
2112
2113 /**
2114  * Mimic alloc_tables(), but for every context thread.
2115  */
2116 static void clone_tables(H264Context *dst, H264Context *src){
2117     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2118     dst->non_zero_count           = src->non_zero_count;
2119     dst->slice_table              = src->slice_table;
2120     dst->cbp_table                = src->cbp_table;
2121     dst->mb2b_xy                  = src->mb2b_xy;
2122     dst->mb2b8_xy                 = src->mb2b8_xy;
2123     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2124     dst->mvd_table[0]             = src->mvd_table[0];
2125     dst->mvd_table[1]             = src->mvd_table[1];
2126     dst->direct_table             = src->direct_table;
2127
2128     dst->s.obmc_scratchpad = NULL;
2129     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2130 }
2131
2132 /**
2133  * Init context
2134  * Allocate buffers which are not shared amongst multiple threads.
2135  */
2136 static int context_init(H264Context *h){
2137     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2138     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2139
2140     return 0;
2141 fail:
2142     return -1; // free_tables will clean up for us
2143 }
2144
2145 static av_cold void common_init(H264Context *h){
2146     MpegEncContext * const s = &h->s;
2147
2148     s->width = s->avctx->width;
2149     s->height = s->avctx->height;
2150     s->codec_id= s->avctx->codec->id;
2151
2152     ff_h264_pred_init(&h->hpc, s->codec_id);
2153
2154     h->dequant_coeff_pps= -1;
2155     s->unrestricted_mv=1;
2156     s->decode=1; //FIXME
2157
2158     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2159
2160     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2161     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2162 }
2163
2164 static av_cold int decode_init(AVCodecContext *avctx){
2165     H264Context *h= avctx->priv_data;
2166     MpegEncContext * const s = &h->s;
2167
2168     MPV_decode_defaults(s);
2169
2170     s->avctx = avctx;
2171     common_init(h);
2172
2173     s->out_format = FMT_H264;
2174     s->workaround_bugs= avctx->workaround_bugs;
2175
2176     // set defaults
2177 //    s->decode_mb= ff_h263_decode_mb;
2178     s->quarter_sample = 1;
2179     s->low_delay= 1;
2180
2181     if(avctx->codec_id == CODEC_ID_SVQ3)
2182         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2183     else if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2184         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2185     else
2186         avctx->pix_fmt= PIX_FMT_YUV420P;
2187
2188     decode_init_vlc();
2189
2190     if(avctx->extradata_size > 0 && avctx->extradata &&
2191        *(char *)avctx->extradata == 1){
2192         h->is_avc = 1;
2193         h->got_avcC = 0;
2194     } else {
2195         h->is_avc = 0;
2196     }
2197
2198     h->thread_context[0] = h;
2199     h->outputed_poc = INT_MIN;
2200     h->prev_poc_msb= 1<<16;
2201     h->sei_recovery_frame_cnt = -1;
2202     h->sei_dpb_output_delay = 0;
2203     h->sei_cpb_removal_delay = -1;
2204     h->sei_buffering_period_present = 0;
2205     return 0;
2206 }
2207
2208 static int frame_start(H264Context *h){
2209     MpegEncContext * const s = &h->s;
2210     int i;
2211
2212     if(MPV_frame_start(s, s->avctx) < 0)
2213         return -1;
2214     ff_er_frame_start(s);
2215     /*
2216      * MPV_frame_start uses pict_type to derive key_frame.
2217      * This is incorrect for H.264; IDR markings must be used.
2218      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2219      * See decode_nal_units().
2220      */
2221     s->current_picture_ptr->key_frame= 0;
2222
2223     assert(s->linesize && s->uvlinesize);
2224
2225     for(i=0; i<16; i++){
2226         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2227         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2228     }
2229     for(i=0; i<4; i++){
2230         h->block_offset[16+i]=
2231         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2232         h->block_offset[24+16+i]=
2233         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2234     }
2235
2236     /* can't be in alloc_tables because linesize isn't known there.
2237      * FIXME: redo bipred weight to not require extra buffer? */
2238     for(i = 0; i < s->avctx->thread_count; i++)
2239         if(!h->thread_context[i]->s.obmc_scratchpad)
2240             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2241
2242     /* some macroblocks will be accessed before they're available */
2243     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2244         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2245
2246 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2247
2248     // We mark the current picture as non-reference after allocating it, so
2249     // that if we break out due to an error it can be released automatically
2250     // in the next MPV_frame_start().
2251     // SVQ3 as well as most other codecs have only last/next/current and thus
2252     // get released even with set reference, besides SVQ3 and others do not
2253     // mark frames as reference later "naturally".
2254     if(s->codec_id != CODEC_ID_SVQ3)
2255         s->current_picture_ptr->reference= 0;
2256
2257     s->current_picture_ptr->field_poc[0]=
2258     s->current_picture_ptr->field_poc[1]= INT_MAX;
2259     assert(s->current_picture_ptr->long_ref==0);
2260
2261     return 0;
2262 }
2263
/**
 * Saves the right-most column and the bottom row of the current macroblock
 * into h->left_border and h->top_borders.
 *
 * hl_decode_mb_internal() calls this right before the loop filter runs on
 * the macroblock, so the saved samples are the not yet deblocked ones.
 *
 * Layout used below: luma entries of left_border start at index `offset`,
 * Cb at `uvoffset+34` and Cr at `uvoffset+34+18`; every top_borders entry
 * holds 16 luma bytes followed by 8 Cb and 8 Cr bytes.
 *
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma stride used for this macroblock
 * @param uvlinesize chroma stride used for this macroblock
 * @param simple     if non-zero, the MBAFF specific handling is skipped
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;
    int skiplast= 0;

    // step one row up, so that row index i (1..16 luma, 1..8 chroma) below
    // addresses row i-1 of the macroblock
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            // bottom macroblock of a pair
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
            if(!MB_MBAFF){
                // additionally keep the second to last row in top_borders[0]
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                }
            }
        }else{
            // top macroblock of a pair
            if(!MB_MBAFF){
                h->left_border[0]= h->top_borders[0][s->mb_x][15];
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
                    h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                }
                // the column loops below then stop one row early
                skiplast= 1;
            }
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        // field macroblocks fill every second left_border entry
        step= MB_MBAFF ? 2 : 1;
    }

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
    for(i=1; i<17 - skiplast; i++){
        h->left_border[offset+i*step]= src_y[15+i*  linesize];
    }

    // bottom luma row of the macroblock, copied as two 64 bit words
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
        h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
        for(i=1; i<9 - skiplast; i++){
            h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
            h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
        }
        // bottom chroma rows of the macroblock
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
2326
/**
 * Exchanges the samples bordering the current macroblock on the left and on
 * top with the values kept in h->left_border / h->top_borders.
 *
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction and
 * with xchg=0 after it, and only when the deblocking filter is enabled.
 * The index computations (offset, uvoffset, top_idx, step) mirror the ones
 * in backup_mb_border().
 *
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma stride used for this macroblock
 * @param uvlinesize chroma stride used for this macroblock
 * @param xchg       if non-zero the values are swapped; if zero the buffered
 *                   value is only written back into the picture (see XCHG)
 * @param simple     if non-zero, the MBAFF specific handling is skipped
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left;
    int deblock_top;
    int mb_xy;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
        }else{
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    if(h->deblocking_filter == 2) {
        // an edge is handled only if the neighbour has the same slice_table entry
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        // otherwise every edge that is not on the picture border is handled
        deblock_left = (s->mb_x > 0);
        deblock_top =  (s->mb_y > !!MB_FIELD);
    }

    // move to the sample above and to the left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// XCHG(a,b,t,xchg): with xchg set, a and b are swapped through t;
// otherwise b receives the value of a and a is left unchanged.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // left luma column; row 0 (the top-left corner) is skipped when the
        // top edge is not handled
        for(i = !deblock_top; i<16; i++){
            XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
        }
        // the last entry is always swapped, independent of xchg
        XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
    }

    if(deblock_top){
        // row above the macroblock: 16 luma samples, plus the first 8 of the
        // next macroblock's entry when there is one to the right
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<8; i++){
                XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
            }
            XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
            XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2400
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into the current
 * picture: prediction, residual addition and, if enabled, deblocking.
 *
 * The steps are, in order:
 *  - compute the destination pointers and strides (doubled strides for
 *    field macroblocks),
 *  - PCM macroblocks: copy the samples straight from h->mb,
 *  - intra macroblocks: run the intra prediction (with the border exchange
 *    around it when deblocking is on); inter macroblocks: hl_motion(),
 *  - add the luma and chroma residuals,
 *  - clear h->mb, then back up the borders and run the loop filter.
 *
 * @param simple if non-zero, the branches guarded by !simple are not taken:
 *               field/MBAFF handling, PCM macroblocks, transform bypass and
 *               the SVQ3 paths; the gray-only flag is ignored as well
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    /* is_h264 should always be true if SVQ3 is disabled. */
    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    /* top-left sample of the macroblock in each plane (16x16 luma, 8x8 chroma) */
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: rows are two picture lines apart and the second
         * half of block_offset[] is used */
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* rewrite the cached reference indices as (16+ref)^(mb_y&1) */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        /* PCM: h->mb holds the samples themselves, copy them row by row */
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* swap in the saved border samples for the duration of the
             * intra prediction (undone by the xchg=0 call further down) */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                /* for intra 4x4/8x8 the residual is added block by block,
                 * directly after each block has been predicted */
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        /* no top-right block: replicate the last
                                         * sample of the row above four times */
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                /* intra 16x16: predict the whole block, then process the luma
                 * DC coefficients; the residual is added further down */
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            /* inter macroblock: motion compensated prediction */
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        /* luma residual for everything except intra 4x4 (handled above) */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* chroma residual: blocks 16..19 belong to Cb, 20..23 to Cr */
        if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    /* reset the coefficient buffer if this macroblock may have used it */
    if(h->cbp || IS_INTRA(mb_type))
        s->dsp.clear_blocks(h->mb);

    if(h->deblocking_filter) {
        /* save the borders before they get filtered, then run the loop filter */
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2655
2656 /**
2657  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2658  */
2659 static void hl_decode_mb_simple(H264Context *h){
2660     hl_decode_mb_internal(h, 1);
2661 }
2662
2663 /**
2664  * Process a macroblock; this handles edge cases, such as interlacing.
2665  */
2666 static void av_noinline hl_decode_mb_complex(H264Context *h){
2667     hl_decode_mb_internal(h, 0);
2668 }
2669
2670 static void hl_decode_mb(H264Context *h){
2671     MpegEncContext * const s = &h->s;
2672     const int mb_xy= h->mb_xy;
2673     const int mb_type= s->current_picture.mb_type[mb_xy];
2674     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2675
2676     if (is_complex)
2677         hl_decode_mb_complex(h);
2678     else hl_decode_mb_simple(h);
2679 }
2680
2681 static void pic_as_field(Picture *pic, const int parity){
2682     int i;
2683     for (i = 0; i < 4; ++i) {
2684         if (parity == PICT_BOTTOM_FIELD)
2685             pic->data[i] += pic->linesize[i];
2686         pic->reference = parity;
2687         pic->linesize[i] *= 2;
2688     }
2689     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2690 }
2691
2692 static int split_field_copy(Picture *dest, Picture *src,
2693                             int parity, int id_add){
2694     int match = !!(src->reference & parity);
2695
2696     if (match) {
2697         *dest = *src;
2698         if(parity != PICT_FRAME){
2699             pic_as_field(dest, parity);
2700             dest->pic_id *= 2;
2701             dest->pic_id += id_add;
2702         }
2703     }
2704
2705     return match;
2706 }
2707
2708 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2709     int i[2]={0};
2710     int index=0;
2711
2712     while(i[0]<len || i[1]<len){
2713         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2714             i[0]++;
2715         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2716             i[1]++;
2717         if(i[0] < len){
2718             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2719             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2720         }
2721         if(i[1] < len){
2722             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2723             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2724         }
2725     }
2726
2727     return index;
2728 }
2729
2730 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2731     int i, best_poc;
2732     int out_i= 0;
2733
2734     for(;;){
2735         best_poc= dir ? INT_MIN : INT_MAX;
2736
2737         for(i=0; i<len; i++){
2738             const int poc= src[i]->poc;
2739             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2740                 best_poc= poc;
2741                 sorted[out_i]= src[i];
2742             }
2743         }
2744         if(best_poc == (dir ? INT_MIN : INT_MAX))
2745             break;
2746         limit= sorted[out_i++]->poc - dir;
2747     }
2748     return out_i;
2749 }
2750
2751 /**
2752  * fills the default_ref_list.
2753  */
2754 static int fill_default_ref_list(H264Context *h){
2755     MpegEncContext * const s = &h->s;
2756     int i, len;
2757
2758     if(h->slice_type_nos==FF_B_TYPE){
2759         Picture *sorted[32];
2760         int cur_poc, list;
2761         int lens[2];
2762
2763         if(FIELD_PICTURE)
2764             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2765         else
2766             cur_poc= s->current_picture_ptr->poc;
2767
2768         for(list= 0; list<2; list++){
2769             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2770             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2771             assert(len<=32);
2772             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2773             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2774             assert(len<=32);
2775
2776             if(len < h->ref_count[list])
2777                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2778             lens[list]= len;
2779         }
2780
2781         if(lens[0] == lens[1] && lens[1] > 1){
2782             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2783             if(i == lens[0])
2784                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2785         }
2786     }else{
2787         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2788         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2789         assert(len <= 32);
2790         if(len < h->ref_count[0])
2791             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2792     }
2793 #ifdef TRACE
2794     for (i=0; i<h->ref_count[0]; i++) {
2795         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2796     }
2797     if(h->slice_type_nos==FF_B_TYPE){
2798         for (i=0; i<h->ref_count[1]; i++) {
2799             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2800         }
2801     }
2802 #endif
2803     return 0;
2804 }
2805
2806 static void print_short_term(H264Context *h);
2807 static void print_long_term(H264Context *h);
2808
2809 /**
2810  * Extract structure information about the picture described by pic_num in
2811  * the current decoding context (frame or field). Note that pic_num is
2812  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2813  * @param pic_num picture number for which to extract structure information
2814  * @param structure one of PICT_XXX describing structure of picture
2815  *                      with pic_num
2816  * @return frame number (short term) or long term index of picture
2817  *         described by pic_num
2818  */
2819 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2820     MpegEncContext * const s = &h->s;
2821
2822     *structure = s->picture_structure;
2823     if(FIELD_PICTURE){
2824         if (!(pic_num & 1))
2825             /* opposite field */
2826             *structure ^= PICT_FRAME;
2827         pic_num >>= 1;
2828     }
2829
2830     return pic_num;
2831 }
2832
2833 static int decode_ref_pic_list_reordering(H264Context *h){
2834     MpegEncContext * const s = &h->s;
2835     int list, index, pic_structure;
2836
2837     print_short_term(h);
2838     print_long_term(h);
2839
2840     for(list=0; list<h->list_count; list++){
2841         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2842
2843         if(get_bits1(&s->gb)){
2844             int pred= h->curr_pic_num;
2845
2846             for(index=0; ; index++){
2847                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2848                 unsigned int pic_id;
2849                 int i;
2850                 Picture *ref = NULL;
2851
2852                 if(reordering_of_pic_nums_idc==3)
2853                     break;
2854
2855                 if(index >= h->ref_count[list]){
2856                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2857                     return -1;
2858                 }
2859
2860                 if(reordering_of_pic_nums_idc<3){
2861                     if(reordering_of_pic_nums_idc<2){
2862                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2863                         int frame_num;
2864
2865                         if(abs_diff_pic_num > h->max_pic_num){
2866                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2867                             return -1;
2868                         }
2869
2870                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2871                         else                                pred+= abs_diff_pic_num;
2872                         pred &= h->max_pic_num - 1;
2873
2874                         frame_num = pic_num_extract(h, pred, &pic_structure);
2875
2876                         for(i= h->short_ref_count-1; i>=0; i--){
2877                             ref = h->short_ref[i];
2878                             assert(ref->reference);
2879                             assert(!ref->long_ref);
2880                             if(
2881                                    ref->frame_num == frame_num &&
2882                                    (ref->reference & pic_structure)
2883                               )
2884                                 break;
2885                         }
2886                         if(i>=0)
2887                             ref->pic_id= pred;
2888                     }else{
2889                         int long_idx;
2890                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2891
2892                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2893
2894                         if(long_idx>31){
2895                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2896                             return -1;
2897                         }
2898                         ref = h->long_ref[long_idx];
2899                         assert(!(ref && !ref->reference));
2900                         if(ref && (ref->reference & pic_structure)){
2901                             ref->pic_id= pic_id;
2902                             assert(ref->long_ref);
2903                             i=0;
2904                         }else{
2905                             i=-1;
2906                         }
2907                     }
2908
2909                     if (i < 0) {
2910                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2911                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2912                     } else {
2913                         for(i=index; i+1<h->ref_count[list]; i++){
2914                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2915                                 break;
2916                         }
2917                         for(; i > index; i--){
2918                             h->ref_list[list][i]= h->ref_list[list][i-1];
2919                         }
2920                         h->ref_list[list][index]= *ref;
2921                         if (FIELD_PICTURE){
2922                             pic_as_field(&h->ref_list[list][index], pic_structure);
2923                         }
2924                     }
2925                 }else{
2926                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2927                     return -1;
2928                 }
2929             }
2930         }
2931     }
2932     for(list=0; list<h->list_count; list++){
2933         for(index= 0; index < h->ref_count[list]; index++){
2934             if(!h->ref_list[list][index].data[0]){
2935                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2936                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2937             }
2938         }
2939     }
2940
2941     return 0;
2942 }
2943
2944 static void fill_mbaff_ref_list(H264Context *h){
2945     int list, i, j;
2946     for(list=0; list<2; list++){ //FIXME try list_count
2947         for(i=0; i<h->ref_count[list]; i++){
2948             Picture *frame = &h->ref_list[list][i];
2949             Picture *field = &h->ref_list[list][16+2*i];
2950             field[0] = *frame;
2951             for(j=0; j<3; j++)
2952                 field[0].linesize[j] <<= 1;
2953             field[0].reference = PICT_TOP_FIELD;
2954             field[0].poc= field[0].field_poc[0];
2955             field[1] = field[0];
2956             for(j=0; j<3; j++)
2957                 field[1].data[j] += frame->linesize[j];
2958             field[1].reference = PICT_BOTTOM_FIELD;
2959             field[1].poc= field[1].field_poc[1];
2960
2961             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2962             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2963             for(j=0; j<2; j++){
2964                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2965                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2966             }
2967         }
2968     }
2969     for(j=0; j<h->ref_count[1]; j++){
2970         for(i=0; i<h->ref_count[0]; i++)
2971             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2972         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2973         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2974     }
2975 }
2976
2977 static int pred_weight_table(H264Context *h){
2978     MpegEncContext * const s = &h->s;
2979     int list, i;
2980     int luma_def, chroma_def;
2981
2982     h->use_weight= 0;
2983     h->use_weight_chroma= 0;
2984     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2985     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2986     luma_def = 1<<h->luma_log2_weight_denom;
2987     chroma_def = 1<<h->chroma_log2_weight_denom;
2988
2989     for(list=0; list<2; list++){
2990         h->luma_weight_flag[list]   = 0;
2991         h->chroma_weight_flag[list] = 0;
2992         for(i=0; i<h->ref_count[list]; i++){
2993             int luma_weight_flag, chroma_weight_flag;
2994
2995             luma_weight_flag= get_bits1(&s->gb);
2996             if(luma_weight_flag){
2997                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2998                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2999                 if(   h->luma_weight[list][i] != luma_def
3000                    || h->luma_offset[list][i] != 0) {
3001                     h->use_weight= 1;
3002                     h->luma_weight_flag[list]= 1;
3003                 }
3004             }else{
3005                 h->luma_weight[list][i]= luma_def;
3006                 h->luma_offset[list][i]= 0;
3007             }
3008
3009             if(CHROMA){
3010                 chroma_weight_flag= get_bits1(&s->gb);
3011                 if(chroma_weight_flag){
3012                     int j;
3013                     for(j=0; j<2; j++){
3014                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3015                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3016                         if(   h->chroma_weight[list][i][j] != chroma_def
3017                            || h->chroma_offset[list][i][j] != 0) {
3018                             h->use_weight_chroma= 1;
3019                             h->chroma_weight_flag[list]= 1;
3020                         }
3021                     }
3022                 }else{
3023                     int j;
3024                     for(j=0; j<2; j++){
3025                         h->chroma_weight[list][i][j]= chroma_def;
3026                         h->chroma_offset[list][i][j]= 0;
3027                     }
3028                 }
3029             }
3030         }
3031         if(h->slice_type_nos != FF_B_TYPE) break;
3032     }
3033     h->use_weight= h->use_weight || h->use_weight_chroma;
3034     return 0;
3035 }
3036
3037 static void implicit_weight_table(H264Context *h){
3038     MpegEncContext * const s = &h->s;
3039     int ref0, ref1, i;
3040     int cur_poc = s->current_picture_ptr->poc;
3041
3042     for (i = 0; i < 2; i++) {
3043         h->luma_weight_flag[i]   = 0;
3044         h->chroma_weight_flag[i] = 0;
3045     }
3046
3047     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3048        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3049         h->use_weight= 0;
3050         h->use_weight_chroma= 0;
3051         return;
3052     }
3053
3054     h->use_weight= 2;
3055     h->use_weight_chroma= 2;
3056     h->luma_log2_weight_denom= 5;
3057     h->chroma_log2_weight_denom= 5;
3058
3059     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3060         int poc0 = h->ref_list[0][ref0].poc;
3061         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3062             int poc1 = h->ref_list[1][ref1].poc;
3063             int td = av_clip(poc1 - poc0, -128, 127);
3064             if(td){
3065                 int tb = av_clip(cur_poc - poc0, -128, 127);
3066                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3067                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3068                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3069                     h->implicit_weight[ref0][ref1] = 32;
3070                 else
3071                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3072             }else
3073                 h->implicit_weight[ref0][ref1] = 32;
3074         }
3075     }
3076 }
3077
3078 /**
3079  * Mark a picture as no longer needed for reference. The refmask
3080  * argument allows unreferencing of individual fields or the whole frame.
3081  * If the picture becomes entirely unreferenced, but is being held for
3082  * display purposes, it is marked as such.
3083  * @param refmask mask of fields to unreference; the mask is bitwise
3084  *                anded with the reference marking of pic
3085  * @return non-zero if pic becomes entirely unreferenced (except possibly
3086  *         for display purposes) zero if one of the fields remains in
3087  *         reference
3088  */
3089 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3090     int i;
3091     if (pic->reference &= refmask) {
3092         return 0;
3093     } else {
3094         for(i = 0; h->delayed_pic[i]; i++)
3095             if(pic == h->delayed_pic[i]){
3096                 pic->reference=DELAYED_PIC_REF;
3097                 break;
3098             }
3099         return 1;
3100     }
3101 }
3102
3103 /**
3104  * instantaneous decoder refresh.
3105  */
3106 static void idr(H264Context *h){
3107     int i;
3108
3109     for(i=0; i<16; i++){
3110         remove_long(h, i, 0);
3111     }
3112     assert(h->long_ref_count==0);
3113
3114     for(i=0; i<h->short_ref_count; i++){
3115         unreference_pic(h, h->short_ref[i], 0);
3116         h->short_ref[i]= NULL;
3117     }
3118     h->short_ref_count=0;
3119     h->prev_frame_num= 0;
3120     h->prev_frame_num_offset= 0;
3121     h->prev_poc_msb=
3122     h->prev_poc_lsb= 0;
3123 }
3124
3125 /* forget old pics after a seek */
3126 static void flush_dpb(AVCodecContext *avctx){
3127     H264Context *h= avctx->priv_data;
3128     int i;
3129     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3130         if(h->delayed_pic[i])
3131             h->delayed_pic[i]->reference= 0;
3132         h->delayed_pic[i]= NULL;
3133     }
3134     h->outputed_poc= INT_MIN;
3135     idr(h);
3136     if(h->s.current_picture_ptr)
3137         h->s.current_picture_ptr->reference= 0;
3138     h->s.first_field= 0;
3139     h->sei_recovery_frame_cnt = -1;
3140     h->sei_dpb_output_delay = 0;
3141     h->sei_cpb_removal_delay = -1;
3142     h->sei_buffering_period_present = 0;
3143     ff_mpeg_flush(avctx);
3144 }
3145
3146 /**
3147  * Find a Picture in the short term reference list by frame number.
3148  * @param frame_num frame number to search for
3149  * @param idx the index into h->short_ref where returned picture is found
3150  *            undefined if no picture found.
3151  * @return pointer to the found picture, or NULL if no pic with the provided
3152  *                 frame number is found
3153  */
3154 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3155     MpegEncContext * const s = &h->s;
3156     int i;
3157
3158     for(i=0; i<h->short_ref_count; i++){
3159         Picture *pic= h->short_ref[i];
3160         if(s->avctx->debug&FF_DEBUG_MMCO)
3161             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3162         if(pic->frame_num == frame_num) {
3163             *idx = i;
3164             return pic;
3165         }
3166     }
3167     return NULL;
3168 }
3169
3170 /**
3171  * Remove a picture from the short term reference list by its index in
3172  * that list.  This does no checking on the provided index; it is assumed
3173  * to be valid. Other list entries are shifted down.
3174  * @param i index into h->short_ref of picture to remove.
3175  */
3176 static void remove_short_at_index(H264Context *h, int i){
3177     assert(i >= 0 && i < h->short_ref_count);
3178     h->short_ref[i]= NULL;
3179     if (--h->short_ref_count)
3180         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3181 }
3182
3183 /**
3184  *
3185  * @return the removed picture or NULL if an error occurs
3186  */
3187 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3188     MpegEncContext * const s = &h->s;
3189     Picture *pic;
3190     int i;
3191
3192     if(s->avctx->debug&FF_DEBUG_MMCO)
3193         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3194
3195     pic = find_short(h, frame_num, &i);
3196     if (pic){
3197         if(unreference_pic(h, pic, ref_mask))
3198         remove_short_at_index(h, i);
3199     }
3200
3201     return pic;
3202 }
3203
3204 /**
3205  * Remove a picture from the long term reference list by its index in
3206  * that list.
3207  * @return the removed picture or NULL if an error occurs
3208  */
3209 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3210     Picture *pic;
3211
3212     pic= h->long_ref[i];
3213     if (pic){
3214         if(unreference_pic(h, pic, ref_mask)){
3215             assert(h->long_ref[i]->long_ref == 1);
3216             h->long_ref[i]->long_ref= 0;
3217             h->long_ref[i]= NULL;
3218             h->long_ref_count--;
3219         }
3220     }
3221
3222     return pic;
3223 }
3224
3225 /**
3226  * print short term list
3227  */
3228 static void print_short_term(H264Context *h) {
3229     uint32_t i;
3230     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3231         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3232         for(i=0; i<h->short_ref_count; i++){
3233             Picture *pic= h->short_ref[i];
3234             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3235         }
3236     }
3237 }
3238
3239 /**
3240  * print long term list
3241  */
3242 static void print_long_term(H264Context *h) {
3243     uint32_t i;
3244     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3245         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3246         for(i = 0; i < 16; i++){
3247             Picture *pic= h->long_ref[i];
3248             if (pic) {
3249                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3250             }
3251         }
3252     }
3253 }
3254
3255 /**
3256  * Executes the reference picture marking (memory management control operations).
3257  */
3258 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3259     MpegEncContext * const s = &h->s;
3260     int i, j;
3261     int current_ref_assigned=0;
3262     Picture *pic;
3263
3264     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3265         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3266
3267     for(i=0; i<mmco_count; i++){
3268         int structure, frame_num;
3269         if(s->avctx->debug&FF_DEBUG_MMCO)
3270             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3271
3272         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3273            || mmco[i].opcode == MMCO_SHORT2LONG){
3274             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3275             pic = find_short(h, frame_num, &j);
3276             if(!pic){
3277                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3278                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3279                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3280                 continue;
3281             }
3282         }
3283
3284         switch(mmco[i].opcode){
3285         case MMCO_SHORT2UNUSED:
3286             if(s->avctx->debug&FF_DEBUG_MMCO)
3287                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3288             remove_short(h, frame_num, structure ^ PICT_FRAME);
3289             break;
3290         case MMCO_SHORT2LONG:
3291                 if (h->long_ref[mmco[i].long_arg] != pic)
3292                     remove_long(h, mmco[i].long_arg, 0);
3293
3294                 remove_short_at_index(h, j);
3295                 h->long_ref[ mmco[i].long_arg ]= pic;
3296                 if (h->long_ref[ mmco[i].long_arg ]){
3297                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3298                     h->long_ref_count++;
3299                 }
3300             break;
3301         case MMCO_LONG2UNUSED:
3302             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3303             pic = h->long_ref[j];
3304             if (pic) {
3305                 remove_long(h, j, structure ^ PICT_FRAME);
3306             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3307                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3308             break;
3309         case MMCO_LONG:
3310                     // Comment below left from previous code as it is an interresting note.
3311                     /* First field in pair is in short term list or
3312                      * at a different long term index.
3313                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3314                      * Report the problem and keep the pair where it is,
3315                      * and mark this field valid.
3316                      */
3317
3318             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3319                 remove_long(h, mmco[i].long_arg, 0);
3320
3321                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3322                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3323                 h->long_ref_count++;
3324             }
3325
3326             s->current_picture_ptr->reference |= s->picture_structure;
3327             current_ref_assigned=1;
3328             break;
3329         case MMCO_SET_MAX_LONG:
3330             assert(mmco[i].long_arg <= 16);
3331             // just remove the long term which index is greater than new max
3332             for(j = mmco[i].long_arg; j<16; j++){
3333                 remove_long(h, j, 0);
3334             }
3335             break;
3336         case MMCO_RESET:
3337             while(h->short_ref_count){
3338                 remove_short(h, h->short_ref[0]->frame_num, 0);
3339             }
3340             for(j = 0; j < 16; j++) {
3341                 remove_long(h, j, 0);
3342             }
3343             s->current_picture_ptr->poc=
3344             s->current_picture_ptr->field_poc[0]=
3345             s->current_picture_ptr->field_poc[1]=
3346             h->poc_lsb=
3347             h->poc_msb=
3348             h->frame_num=
3349             s->current_picture_ptr->frame_num= 0;
3350             break;
3351         default: assert(0);
3352         }
3353     }
3354
3355     if (!current_ref_assigned) {
3356         /* Second field of complementary field pair; the first field of
3357          * which is already referenced. If short referenced, it
3358          * should be first entry in short_ref. If not, it must exist
3359          * in long_ref; trying to put it on the short list here is an
3360          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3361          */
3362         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3363             /* Just mark the second field valid */
3364             s->current_picture_ptr->reference = PICT_FRAME;
3365         } else if (s->current_picture_ptr->long_ref) {
3366             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3367                                              "assignment for second field "
3368                                              "in complementary field pair "
3369                                              "(first field is long term)\n");
3370         } else {
3371             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3372             if(pic){
3373                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3374             }
3375
3376             if(h->short_ref_count)
3377                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3378
3379             h->short_ref[0]= s->current_picture_ptr;
3380             h->short_ref_count++;
3381             s->current_picture_ptr->reference |= s->picture_structure;
3382         }
3383     }
3384
3385     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3386
3387         /* We have too many reference frames, probably due to corrupted
3388          * stream. Need to discard one frame. Prevents overrun of the
3389          * short_ref and long_ref buffers.
3390          */
3391         av_log(h->s.avctx, AV_LOG_ERROR,
3392                "number of reference frames exceeds max (probably "
3393                "corrupt input), discarding one\n");
3394
3395         if (h->long_ref_count && !h->short_ref_count) {
3396             for (i = 0; i < 16; ++i)
3397                 if (h->long_ref[i])
3398                     break;
3399
3400             assert(i < 16);
3401             remove_long(h, i, 0);
3402         } else {
3403             pic = h->short_ref[h->short_ref_count - 1];
3404             remove_short(h, pic->frame_num, 0);
3405         }
3406     }
3407
3408     print_short_term(h);
3409     print_long_term(h);
3410     return 0;
3411 }
3412
3413 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3414     MpegEncContext * const s = &h->s;
3415     int i;
3416
3417     h->mmco_index= 0;
3418     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3419         s->broken_link= get_bits1(gb) -1;
3420         if(get_bits1(gb)){
3421             h->mmco[0].opcode= MMCO_LONG;
3422             h->mmco[0].long_arg= 0;
3423             h->mmco_index= 1;
3424         }
3425     }else{
3426         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3427             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3428                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3429
3430                 h->mmco[i].opcode= opcode;
3431                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3432                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3433 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3434                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3435                         return -1;
3436                     }*/
3437                 }
3438                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3439                     unsigned int long_arg= get_ue_golomb_31(gb);
3440                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3441                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3442                         return -1;
3443                     }
3444                     h->mmco[i].long_arg= long_arg;
3445                 }
3446
3447                 if(opcode > (unsigned)MMCO_LONG){
3448                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3449                     return -1;
3450                 }
3451                 if(opcode == MMCO_END)
3452                     break;
3453             }
3454             h->mmco_index= i;
3455         }else{
3456             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3457
3458             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3459                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3460                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3461                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3462                 h->mmco_index= 1;
3463                 if (FIELD_PICTURE) {
3464                     h->mmco[0].short_pic_num *= 2;
3465                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3466                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3467                     h->mmco_index= 2;
3468                 }
3469             }
3470         }
3471     }
3472
3473     return 0;
3474 }
3475
3476 static int init_poc(H264Context *h){
3477     MpegEncContext * const s = &h->s;
3478     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3479     int field_poc[2];
3480     Picture *cur = s->current_picture_ptr;
3481
3482     h->frame_num_offset= h->prev_frame_num_offset;
3483     if(h->frame_num < h->prev_frame_num)
3484         h->frame_num_offset += max_frame_num;
3485
3486     if(h->sps.poc_type==0){
3487         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3488
3489         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3490             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3491         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3492             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3493         else
3494             h->poc_msb = h->prev_poc_msb;
3495 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3496         field_poc[0] =
3497         field_poc[1] = h->poc_msb + h->poc_lsb;
3498         if(s->picture_structure == PICT_FRAME)
3499             field_poc[1] += h->delta_poc_bottom;
3500     }else if(h->sps.poc_type==1){
3501         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3502         int i;
3503
3504         if(h->sps.poc_cycle_length != 0)
3505             abs_frame_num = h->frame_num_offset + h->frame_num;
3506         else
3507             abs_frame_num = 0;
3508
3509         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3510             abs_frame_num--;
3511
3512         expected_delta_per_poc_cycle = 0;
3513         for(i=0; i < h->sps.poc_cycle_length; i++)
3514             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3515
3516         if(abs_frame_num > 0){
3517             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3518             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3519
3520             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3521             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3522                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3523         } else
3524             expectedpoc = 0;
3525
3526         if(h->nal_ref_idc == 0)
3527             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3528
3529         field_poc[0] = expectedpoc + h->delta_poc[0];
3530         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3531
3532         if(s->picture_structure == PICT_FRAME)
3533             field_poc[1] += h->delta_poc[1];
3534     }else{
3535         int poc= 2*(h->frame_num_offset + h->frame_num);
3536
3537         if(!h->nal_ref_idc)
3538             poc--;
3539
3540         field_poc[0]= poc;
3541         field_poc[1]= poc;
3542     }
3543
3544     if(s->picture_structure != PICT_BOTTOM_FIELD)
3545         s->current_picture_ptr->field_poc[0]= field_poc[0];
3546     if(s->picture_structure != PICT_TOP_FIELD)
3547         s->current_picture_ptr->field_poc[1]= field_poc[1];
3548     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3549
3550     return 0;
3551 }
3552
3553
3554 /**
3555  * initialize scan tables
3556  */
3557 static void init_scan_tables(H264Context *h){
3558     MpegEncContext * const s = &h->s;
3559     int i;
3560     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3561         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3562         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3563     }else{
3564         for(i=0; i<16; i++){
3565 #define T(x) (x>>2) | ((x<<2) & 0xF)
3566             h->zigzag_scan[i] = T(zigzag_scan[i]);
3567             h-> field_scan[i] = T( field_scan[i]);
3568 #undef T
3569         }
3570     }
3571     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3572         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3573         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3574         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3575         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3576     }else{
3577         for(i=0; i<64; i++){
3578 #define T(x) (x>>3) | ((x&7)<<3)
3579             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3580             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3581             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3582             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3583 #undef T
3584         }
3585     }
3586     if(h->sps.transform_bypass){ //FIXME same ugly
3587         h->zigzag_scan_q0          = zigzag_scan;
3588         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3589         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3590         h->field_scan_q0           = field_scan;
3591         h->field_scan8x8_q0        = field_scan8x8;
3592         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3593     }else{
3594         h->zigzag_scan_q0          = h->zigzag_scan;
3595         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3596         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3597         h->field_scan_q0           = h->field_scan;
3598         h->field_scan8x8_q0        = h->field_scan8x8;
3599         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3600     }
3601 }
3602
3603 /**
3604  * Replicates H264 "master" context to thread contexts.
3605  */
3606 static void clone_slice(H264Context *dst, H264Context *src)
3607 {
3608     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3609     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3610     dst->s.current_picture      = src->s.current_picture;
3611     dst->s.linesize             = src->s.linesize;
3612     dst->s.uvlinesize           = src->s.uvlinesize;
3613     dst->s.first_field          = src->s.first_field;
3614
3615     dst->prev_poc_msb           = src->prev_poc_msb;
3616     dst->prev_poc_lsb           = src->prev_poc_lsb;
3617     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3618     dst->prev_frame_num         = src->prev_frame_num;
3619     dst->short_ref_count        = src->short_ref_count;
3620
3621     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3622     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3623     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3624     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3625
3626     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3627     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3628 }
3629
3630 /**
3631  * decodes a slice header.
3632  * This will also call MPV_common_init() and frame_start() as needed.
3633  *
3634  * @param h h264context
3635  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3636  *
3637  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3638  */
3639 static int decode_slice_header(H264Context *h, H264Context *h0){
3640     MpegEncContext * const s = &h->s;
3641     MpegEncContext * const s0 = &h0->s;
3642     unsigned int first_mb_in_slice;
3643     unsigned int pps_id;
3644     int num_ref_idx_active_override_flag;
3645     unsigned int slice_type, tmp, i, j;
3646     int default_ref_list_done = 0;
3647     int last_pic_structure;
3648
3649     s->dropable= h->nal_ref_idc == 0;
3650
3651     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3652         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3653         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3654     }else{
3655         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3656         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3657     }
3658
3659     first_mb_in_slice= get_ue_golomb(&s->gb);
3660
3661     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3662         h0->current_slice = 0;
3663         if (!s0->first_field)
3664             s->current_picture_ptr= NULL;
3665     }
3666
3667     slice_type= get_ue_golomb_31(&s->gb);
3668     if(slice_type > 9){
3669         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3670         return -1;
3671     }
3672     if(slice_type > 4){
3673         slice_type -= 5;
3674         h->slice_type_fixed=1;
3675     }else
3676         h->slice_type_fixed=0;
3677
3678     slice_type= golomb_to_pict_type[ slice_type ];
3679     if (slice_type == FF_I_TYPE
3680         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3681         default_ref_list_done = 1;
3682     }
3683     h->slice_type= slice_type;
3684     h->slice_type_nos= slice_type & 3;
3685
3686     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3687     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3688         av_log(h->s.avctx, AV_LOG_ERROR,
3689                "B picture before any references, skipping\n");
3690         return -1;
3691     }
3692
3693     pps_id= get_ue_golomb(&s->gb);
3694     if(pps_id>=MAX_PPS_COUNT){
3695         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3696         return -1;
3697     }
3698     if(!h0->pps_buffers[pps_id]) {
3699         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3700         return -1;
3701     }
3702     h->pps= *h0->pps_buffers[pps_id];
3703
3704     if(!h0->sps_buffers[h->pps.sps_id]) {
3705         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3706         return -1;
3707     }
3708     h->sps = *h0->sps_buffers[h->pps.sps_id];
3709
3710     if(h == h0 && h->dequant_coeff_pps != pps_id){
3711         h->dequant_coeff_pps = pps_id;
3712         init_dequant_tables(h);
3713     }
3714
3715     s->mb_width= h->sps.mb_width;
3716     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3717
3718     h->b_stride=  s->mb_width*4;
3719     h->b8_stride= s->mb_width*2;
3720
3721     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3722     if(h->sps.frame_mbs_only_flag)
3723         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3724     else
3725         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3726
3727     if (s->context_initialized
3728         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3729         if(h != h0)
3730             return -1;   // width / height changed during parallelized decoding
3731         free_tables(h);
3732         flush_dpb(s->avctx);
3733         MPV_common_end(s);
3734     }
3735     if (!s->context_initialized) {
3736         if(h != h0)
3737             return -1;  // we cant (re-)initialize context during parallel decoding
3738         if (MPV_common_init(s) < 0)
3739             return -1;
3740         s->first_field = 0;
3741
3742         init_scan_tables(h);
3743         alloc_tables(h);
3744
3745         for(i = 1; i < s->avctx->thread_count; i++) {
3746             H264Context *c;
3747             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3748             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3749             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3750             c->sps = h->sps;
3751             c->pps = h->pps;
3752             init_scan_tables(c);
3753             clone_tables(c, h);
3754         }
3755
3756         for(i = 0; i < s->avctx->thread_count; i++)
3757             if(context_init(h->thread_context[i]) < 0)
3758                 return -1;
3759
3760         s->avctx->width = s->width;
3761         s->avctx->height = s->height;
3762         s->avctx->sample_aspect_ratio= h->sps.sar;
3763         if(!s->avctx->sample_aspect_ratio.den)
3764             s->avctx->sample_aspect_ratio.den = 1;
3765
3766         if(h->sps.timing_info_present_flag){
3767             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3768             if(h->x264_build > 0 && h->x264_build < 44)
3769                 s->avctx->time_base.den *= 2;
3770             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3771                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3772         }
3773     }
3774
3775     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3776
3777     h->mb_mbaff = 0;
3778     h->mb_aff_frame = 0;
3779     last_pic_structure = s0->picture_structure;
3780     if(h->sps.frame_mbs_only_flag){
3781         s->picture_structure= PICT_FRAME;
3782     }else{
3783         if(get_bits1(&s->gb)) { //field_pic_flag
3784             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3785         } else {
3786             s->picture_structure= PICT_FRAME;
3787             h->mb_aff_frame = h->sps.mb_aff;
3788         }
3789     }
3790     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3791
3792     if(h0->current_slice == 0){
3793         while(h->frame_num !=  h->prev_frame_num &&
3794               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3795             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3796             if (frame_start(h) < 0)
3797                 return -1;
3798             h->prev_frame_num++;
3799             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3800             s->current_picture_ptr->frame_num= h->prev_frame_num;
3801             execute_ref_pic_marking(h, NULL, 0);
3802         }
3803
3804         /* See if we have a decoded first field looking for a pair... */
3805         if (s0->first_field) {
3806             assert(s0->current_picture_ptr);
3807             assert(s0->current_picture_ptr->data[0]);
3808             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3809
3810             /* figure out if we have a complementary field pair */
3811             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3812                 /*
3813                  * Previous field is unmatched. Don't display it, but let it
3814                  * remain for reference if marked as such.
3815                  */
3816                 s0->current_picture_ptr = NULL;
3817                 s0->first_field = FIELD_PICTURE;
3818
3819             } else {
3820                 if (h->nal_ref_idc &&
3821                         s0->current_picture_ptr->reference &&
3822                         s0->current_picture_ptr->frame_num != h->frame_num) {
3823                     /*
3824                      * This and previous field were reference, but had
3825                      * different frame_nums. Consider this field first in
3826                      * pair. Throw away previous field except for reference
3827                      * purposes.
3828                      */
3829                     s0->first_field = 1;
3830                     s0->current_picture_ptr = NULL;
3831
3832                 } else {
3833                     /* Second field in complementary pair */
3834                     s0->first_field = 0;
3835                 }
3836             }
3837
3838         } else {
3839             /* Frame or first field in a potentially complementary pair */
3840             assert(!s0->current_picture_ptr);
3841             s0->first_field = FIELD_PICTURE;
3842         }
3843
3844         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3845             s0->first_field = 0;
3846             return -1;
3847         }
3848     }
3849     if(h != h0)
3850         clone_slice(h, h0);
3851
3852     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3853
3854     assert(s->mb_num == s->mb_width * s->mb_height);
3855     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3856        first_mb_in_slice                    >= s->mb_num){
3857         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3858         return -1;
3859     }
3860     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3861     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3862     if (s->picture_structure == PICT_BOTTOM_FIELD)
3863         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3864     assert(s->mb_y < s->mb_height);
3865
3866     if(s->picture_structure==PICT_FRAME){
3867         h->curr_pic_num=   h->frame_num;
3868         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3869     }else{
3870         h->curr_pic_num= 2*h->frame_num + 1;
3871         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3872     }
3873
3874     if(h->nal_unit_type == NAL_IDR_SLICE){
3875         get_ue_golomb(&s->gb); /* idr_pic_id */
3876     }
3877
3878     if(h->sps.poc_type==0){
3879         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3880
3881         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3882             h->delta_poc_bottom= get_se_golomb(&s->gb);
3883         }
3884     }
3885
3886     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3887         h->delta_poc[0]= get_se_golomb(&s->gb);
3888
3889         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3890             h->delta_poc[1]= get_se_golomb(&s->gb);
3891     }
3892
3893     init_poc(h);
3894
3895     if(h->pps.redundant_pic_cnt_present){
3896         h->redundant_pic_count= get_ue_golomb(&s->gb);
3897     }
3898
3899     //set defaults, might be overridden a few lines later
3900     h->ref_count[0]= h->pps.ref_count[0];
3901     h->ref_count[1]= h->pps.ref_count[1];
3902
3903     if(h->slice_type_nos != FF_I_TYPE){
3904         if(h->slice_type_nos == FF_B_TYPE){
3905             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3906         }
3907         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3908
3909         if(num_ref_idx_active_override_flag){
3910             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3911             if(h->slice_type_nos==FF_B_TYPE)
3912                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3913
3914             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3915                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3916                 h->ref_count[0]= h->ref_count[1]= 1;
3917                 return -1;
3918             }
3919         }
3920         if(h->slice_type_nos == FF_B_TYPE)
3921             h->list_count= 2;
3922         else
3923             h->list_count= 1;
3924     }else
3925         h->list_count= 0;
3926
3927     if(!default_ref_list_done){
3928         fill_default_ref_list(h);
3929     }
3930
3931     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3932         return -1;
3933
3934     if(h->slice_type_nos!=FF_I_TYPE){
3935         s->last_picture_ptr= &h->ref_list[0][0];
3936         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3937     }
3938     if(h->slice_type_nos==FF_B_TYPE){
3939         s->next_picture_ptr= &h->ref_list[1][0];
3940         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3941     }
3942
3943     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3944        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3945         pred_weight_table(h);
3946     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3947         implicit_weight_table(h);
3948     else {
3949         h->use_weight = 0;
3950         for (i = 0; i < 2; i++) {
3951             h->luma_weight_flag[i]   = 0;
3952             h->chroma_weight_flag[i] = 0;
3953         }
3954     }
3955
3956     if(h->nal_ref_idc)
3957         decode_ref_pic_marking(h0, &s->gb);
3958
3959     if(FRAME_MBAFF)
3960         fill_mbaff_ref_list(h);
3961
3962     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3963         direct_dist_scale_factor(h);
3964     direct_ref_list_init(h);
3965
3966     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3967         tmp = get_ue_golomb_31(&s->gb);
3968         if(tmp > 2){
3969             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3970             return -1;
3971         }
3972         h->cabac_init_idc= tmp;
3973     }
3974
3975     h->last_qscale_diff = 0;
3976     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3977     if(tmp>51){
3978         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3979         return -1;
3980     }
3981     s->qscale= tmp;
3982     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3983     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3984     //FIXME qscale / qp ... stuff
3985     if(h->slice_type == FF_SP_TYPE){
3986         get_bits1(&s->gb); /* sp_for_switch_flag */
3987     }
3988     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3989         get_se_golomb(&s->gb); /* slice_qs_delta */
3990     }
3991
3992     h->deblocking_filter = 1;
3993     h->slice_alpha_c0_offset = 0;
3994     h->slice_beta_offset = 0;
3995     if( h->pps.deblocking_filter_parameters_present ) {
3996         tmp= get_ue_golomb_31(&s->gb);
3997         if(tmp > 2){
3998             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3999             return -1;
4000         }
4001         h->deblocking_filter= tmp;
4002         if(h->deblocking_filter < 2)
4003             h->deblocking_filter^= 1; // 1<->0
4004
4005         if( h->deblocking_filter ) {
4006             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4007             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4008         }
4009     }
4010
4011     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4012        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4013        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4014        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4015         h->deblocking_filter= 0;
4016
4017     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4018         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4019             /* Cheat slightly for speed:
4020                Do not bother to deblock across slices. */
4021             h->deblocking_filter = 2;
4022         } else {
4023             h0->max_contexts = 1;
4024             if(!h0->single_decode_warning) {
4025                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4026                 h0->single_decode_warning = 1;
4027             }
4028             if(h != h0)
4029                 return 1; // deblocking switched inside frame
4030         }
4031     }
4032
4033 #if 0 //FMO
4034     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4035         slice_group_change_cycle= get_bits(&s->gb, ?);
4036 #endif
4037
4038     h0->last_slice_type = slice_type;
4039     h->slice_num = ++h0->current_slice;
4040     if(h->slice_num >= MAX_SLICES){
4041         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4042     }
4043
4044     for(j=0; j<2; j++){
4045         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4046         ref2frm[0]=
4047         ref2frm[1]= -1;
4048         for(i=0; i<16; i++)
4049             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4050                           +(h->ref_list[j][i].reference&3);
4051         ref2frm[18+0]=
4052         ref2frm[18+1]= -1;
4053         for(i=16; i<48; i++)
4054             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4055                           +(h->ref_list[j][i].reference&3);
4056     }
4057
4058     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4059     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4060
4061     s->avctx->refs= h->sps.ref_frame_count;
4062
4063     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4064         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4065                h->slice_num,
4066                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4067                first_mb_in_slice,
4068                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4069                pps_id, h->frame_num,
4070                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4071                h->ref_count[0], h->ref_count[1],
4072                s->qscale,
4073                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4074                h->use_weight,
4075                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4076                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4077                );
4078     }
4079
4080     return 0;
4081 }
4082
4083 /**
4084  *
4085  */
4086 static inline int get_level_prefix(GetBitContext *gb){
4087     unsigned int buf;
4088     int log;
4089
4090     OPEN_READER(re, gb);
4091     UPDATE_CACHE(re, gb);
4092     buf=GET_CACHE(re, gb);
4093
4094     log= 32 - av_log2(buf);
4095 #ifdef TRACE
4096     print_bin(buf>>(32-log), log);
4097     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4098 #endif
4099
4100     LAST_SKIP_BITS(re, gb, log);
4101     CLOSE_READER(re, gb);
4102
4103     return log-1;
4104 }
4105
4106 static inline int get_dct8x8_allowed(H264Context *h){
4107     if(h->sps.direct_8x8_inference_flag)
4108         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4109     else
4110         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4111 }
4112
4113 /**
4114  * decodes a residual block.
4115  * @param n block index
4116  * @param scantable scantable
4117  * @param max_coeff number of coefficients in the block
4118  * @return <0 if an error occurred
4119  */
4120 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4121     MpegEncContext * const s = &h->s;
4122     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4123     int level[16];
4124     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4125
4126     //FIXME put trailing_onex into the context
4127
4128     if(n == CHROMA_DC_BLOCK_INDEX){
4129         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4130         total_coeff= coeff_token>>2;
4131     }else{
4132         if(n == LUMA_DC_BLOCK_INDEX){
4133             total_coeff= pred_non_zero_count(h, 0);
4134             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4135             total_coeff= coeff_token>>2;
4136         }else{
4137             total_coeff= pred_non_zero_count(h, n);
4138             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4139             total_coeff= coeff_token>>2;
4140             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4141         }
4142     }
4143
4144     //FIXME set last_non_zero?
4145
4146     if(total_coeff==0)
4147         return 0;
4148     if(total_coeff > (unsigned)max_coeff) {
4149         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4150         return -1;
4151     }
4152
4153     trailing_ones= coeff_token&3;
4154     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4155     assert(total_coeff<=16);
4156
4157     i = show_bits(gb, 3);
4158     skip_bits(gb, trailing_ones);
4159     level[0] = 1-((i&4)>>1);
4160     level[1] = 1-((i&2)   );
4161     level[2] = 1-((i&1)<<1);
4162
4163     if(trailing_ones<total_coeff) {
4164         int mask, prefix;
4165         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4166         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4167         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4168
4169         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4170         if(level_code >= 100){
4171             prefix= level_code - 100;
4172             if(prefix == LEVEL_TAB_BITS)
4173                 prefix += get_level_prefix(gb);
4174
4175             //first coefficient has suffix_length equal to 0 or 1
4176             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4177                 if(suffix_length)
4178                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4179                 else
4180                     level_code= (prefix<<suffix_length); //part
4181             }else if(prefix==14){
4182                 if(suffix_length)
4183                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4184                 else
4185                     level_code= prefix + get_bits(gb, 4); //part
4186             }else{
4187                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4188                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4189                 if(prefix>=16)
4190                     level_code += (1<<(prefix-3))-4096;
4191             }
4192
4193             if(trailing_ones < 3) level_code += 2;
4194
4195             suffix_length = 2;
4196             mask= -(level_code&1);
4197             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4198         }else{
4199             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4200
4201             suffix_length = 1;
4202             if(level_code + 3U > 6U)
4203                 suffix_length++;
4204             level[trailing_ones]= level_code;
4205         }
4206
4207         //remaining coefficients have suffix_length > 0
4208         for(i=trailing_ones+1;i<total_coeff;i++) {
4209             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4210             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4211             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4212
4213             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4214             if(level_code >= 100){
4215                 prefix= level_code - 100;
4216                 if(prefix == LEVEL_TAB_BITS){
4217                     prefix += get_level_prefix(gb);
4218                 }
4219                 if(prefix<15){
4220                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4221                 }else{
4222                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4223                     if(prefix>=16)
4224                         level_code += (1<<(prefix-3))-4096;
4225                 }
4226                 mask= -(level_code&1);
4227                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4228             }
4229             level[i]= level_code;
4230
4231             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4232                 suffix_length++;
4233         }
4234     }
4235
4236     if(total_coeff == max_coeff)
4237         zeros_left=0;
4238     else{
4239         if(n == CHROMA_DC_BLOCK_INDEX)
4240             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4241         else
4242             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4243     }
4244
4245     coeff_num = zeros_left + total_coeff - 1;
4246     j = scantable[coeff_num];
4247     if(n > 24){
4248         block[j] = level[0];
4249         for(i=1;i<total_coeff;i++) {
4250             if(zeros_left <= 0)
4251                 run_before = 0;
4252             else if(zeros_left < 7){
4253                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4254             }else{
4255                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4256             }
4257             zeros_left -= run_before;
4258             coeff_num -= 1 + run_before;
4259             j= scantable[ coeff_num ];
4260
4261             block[j]= level[i];
4262         }
4263     }else{
4264         block[j] = (level[0] * qmul[j] + 32)>>6;
4265         for(i=1;i<total_coeff;i++) {
4266             if(zeros_left <= 0)
4267                 run_before = 0;
4268             else if(zeros_left < 7){
4269                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4270             }else{
4271                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4272             }
4273             zeros_left -= run_before;
4274             coeff_num -= 1 + run_before;
4275             j= scantable[ coeff_num ];
4276
4277             block[j]= (level[i] * qmul[j] + 32)>>6;
4278         }
4279     }
4280
4281     if(zeros_left<0){
4282         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4283         return -1;
4284     }
4285
4286     return 0;
4287 }
4288
4289 static void predict_field_decoding_flag(H264Context *h){
4290     MpegEncContext * const s = &h->s;
4291     const int mb_xy= h->mb_xy;
4292     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4293                 ? s->current_picture.mb_type[mb_xy-1]
4294                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4295                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4296                 : 0;
4297     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4298 }
4299
4300 /**
4301  * decodes a P_SKIP or B_SKIP macroblock
4302  */
4303 static void decode_mb_skip(H264Context *h){
4304     MpegEncContext * const s = &h->s;
4305     const int mb_xy= h->mb_xy;
4306     int mb_type=0;
4307
4308     memset(h->non_zero_count[mb_xy], 0, 16);
4309     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4310
4311     if(MB_FIELD)
4312         mb_type|= MB_TYPE_INTERLACED;
4313
4314     if( h->slice_type_nos == FF_B_TYPE )
4315     {
4316         // just for fill_caches. pred_direct_motion will set the real mb_type
4317         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4318
4319         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4320         pred_direct_motion(h, &mb_type);
4321         mb_type|= MB_TYPE_SKIP;
4322     }
4323     else
4324     {
4325         int mx, my;
4326         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4327
4328         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4329         pred_pskip_motion(h, &mx, &my);
4330         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4331         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4332     }
4333
4334     write_back_motion(h, mb_type);
4335     s->current_picture.mb_type[mb_xy]= mb_type;
4336     s->current_picture.qscale_table[mb_xy]= s->qscale;
4337     h->slice_table[ mb_xy ]= h->slice_num;
4338     h->prev_mb_skipped= 1;
4339 }
4340
4341 /**
4342  * decodes a macroblock
4343  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4344  */
4345 static int decode_mb_cavlc(H264Context *h){
4346     MpegEncContext * const s = &h->s;
4347     int mb_xy;
4348     int partition_count;
4349     unsigned int mb_type, cbp;
4350     int dct8x8_allowed= h->pps.transform_8x8_mode;
4351
4352     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4353
4354     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4355     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4356                 down the code */
4357     if(h->slice_type_nos != FF_I_TYPE){
4358         if(s->mb_skip_run==-1)
4359             s->mb_skip_run= get_ue_golomb(&s->gb);
4360
4361         if (s->mb_skip_run--) {
4362             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4363                 if(s->mb_skip_run==0)
4364                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4365                 else
4366                     predict_field_decoding_flag(h);
4367             }
4368             decode_mb_skip(h);
4369             return 0;
4370         }
4371     }
4372     if(FRAME_MBAFF){
4373         if( (s->mb_y&1) == 0 )
4374             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4375     }
4376
4377     h->prev_mb_skipped= 0;
4378
4379     mb_type= get_ue_golomb(&s->gb);
4380     if(h->slice_type_nos == FF_B_TYPE){
4381         if(mb_type < 23){
4382             partition_count= b_mb_type_info[mb_type].partition_count;
4383             mb_type=         b_mb_type_info[mb_type].type;
4384         }else{
4385             mb_type -= 23;
4386             goto decode_intra_mb;
4387         }
4388     }else if(h->slice_type_nos == FF_P_TYPE){
4389         if(mb_type < 5){
4390             partition_count= p_mb_type_info[mb_type].partition_count;
4391             mb_type=         p_mb_type_info[mb_type].type;
4392         }else{
4393             mb_type -= 5;
4394             goto decode_intra_mb;
4395         }
4396     }else{
4397        assert(h->slice_type_nos == FF_I_TYPE);
4398         if(h->slice_type == FF_SI_TYPE && mb_type)
4399             mb_type--;
4400 decode_intra_mb:
4401         if(mb_type > 25){
4402             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4403             return -1;
4404         }
4405         partition_count=0;
4406         cbp= i_mb_type_info[mb_type].cbp;
4407         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4408         mb_type= i_mb_type_info[mb_type].type;
4409     }
4410
4411     if(MB_FIELD)
4412         mb_type |= MB_TYPE_INTERLACED;
4413
4414     h->slice_table[ mb_xy ]= h->slice_num;
4415
4416     if(IS_INTRA_PCM(mb_type)){
4417         unsigned int x;
4418
4419         // We assume these blocks are very rare so we do not optimize it.
4420         align_get_bits(&s->gb);
4421
4422         // The pixels are stored in the same order as levels in h->mb array.
4423         for(x=0; x < (CHROMA ? 384 : 256); x++){
4424             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4425         }
4426
4427         // In deblocking, the quantizer is 0
4428         s->current_picture.qscale_table[mb_xy]= 0;
4429         // All coeffs are present
4430         memset(h->non_zero_count[mb_xy], 16, 16);
4431
4432         s->current_picture.mb_type[mb_xy]= mb_type;
4433         return 0;
4434     }
4435
4436     if(MB_MBAFF){
4437         h->ref_count[0] <<= 1;
4438         h->ref_count[1] <<= 1;
4439     }
4440
4441     fill_caches(h, mb_type, 0);
4442
4443     //mb_pred
4444     if(IS_INTRA(mb_type)){
4445         int pred_mode;
4446 //            init_top_left_availability(h);
4447         if(IS_INTRA4x4(mb_type)){
4448             int i;
4449             int di = 1;
4450             if(dct8x8_allowed && get_bits1(&s->gb)){
4451                 mb_type |= MB_TYPE_8x8DCT;
4452                 di = 4;
4453             }
4454
4455 //                fill_intra4x4_pred_table(h);
4456             for(i=0; i<16; i+=di){
4457                 int mode= pred_intra_mode(h, i);
4458
4459                 if(!get_bits1(&s->gb)){
4460                     const int rem_mode= get_bits(&s->gb, 3);
4461                     mode = rem_mode + (rem_mode >= mode);
4462                 }
4463
4464                 if(di==4)
4465                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4466                 else
4467                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4468             }
4469             write_back_intra_pred_mode(h);
4470             if( check_intra4x4_pred_mode(h) < 0)
4471                 return -1;
4472         }else{
4473             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4474             if(h->intra16x16_pred_mode < 0)
4475                 return -1;
4476         }
4477         if(CHROMA){
4478             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4479             if(pred_mode < 0)
4480                 return -1;
4481             h->chroma_pred_mode= pred_mode;
4482         }
4483     }else if(partition_count==4){
4484         int i, j, sub_partition_count[4], list, ref[2][4];
4485
4486         if(h->slice_type_nos == FF_B_TYPE){
4487             for(i=0; i<4; i++){
4488                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4489                 if(h->sub_mb_type[i] >=13){
4490                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4491                     return -1;
4492                 }
4493                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4494                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4495             }
4496             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4497                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4498                 pred_direct_motion(h, &mb_type);
4499                 h->ref_cache[0][scan8[4]] =
4500                 h->ref_cache[1][scan8[4]] =
4501                 h->ref_cache[0][scan8[12]] =
4502                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4503             }
4504         }else{
4505             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4506             for(i=0; i<4; i++){
4507                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4508                 if(h->sub_mb_type[i] >=4){
4509                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4510                     return -1;
4511                 }
4512                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4513                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4514             }
4515         }
4516
4517         for(list=0; list<h->list_count; list++){
4518             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4519             for(i=0; i<4; i++){
4520                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4521                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4522                     unsigned int tmp;
4523                     if(ref_count == 1){
4524                         tmp= 0;
4525                     }else if(ref_count == 2){
4526                         tmp= get_bits1(&s->gb)^1;
4527                     }else{
4528                         tmp= get_ue_golomb_31(&s->gb);
4529                         if(tmp>=ref_count){
4530                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4531                             return -1;
4532                         }
4533                     }
4534                     ref[list][i]= tmp;
4535                 }else{
4536                  //FIXME
4537                     ref[list][i] = -1;
4538                 }
4539             }
4540         }
4541
4542         if(dct8x8_allowed)
4543             dct8x8_allowed = get_dct8x8_allowed(h);
4544
4545         for(list=0; list<h->list_count; list++){
4546             for(i=0; i<4; i++){
4547                 if(IS_DIRECT(h->sub_mb_type[i])) {
4548                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4549                     continue;
4550                 }
4551                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4552                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4553
4554                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4555                     const int sub_mb_type= h->sub_mb_type[i];
4556                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4557                     for(j=0; j<sub_partition_count[i]; j++){
4558                         int mx, my;
4559                         const int index= 4*i + block_width*j;
4560                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4561                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4562                         mx += get_se_golomb(&s->gb);
4563                         my += get_se_golomb(&s->gb);
4564                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4565
4566                         if(IS_SUB_8X8(sub_mb_type)){
4567                             mv_cache[ 1 ][0]=
4568                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4569                             mv_cache[ 1 ][1]=
4570                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4571                         }else if(IS_SUB_8X4(sub_mb_type)){
4572                             mv_cache[ 1 ][0]= mx;
4573                             mv_cache[ 1 ][1]= my;
4574                         }else if(IS_SUB_4X8(sub_mb_type)){
4575                             mv_cache[ 8 ][0]= mx;
4576                             mv_cache[ 8 ][1]= my;
4577                         }
4578                         mv_cache[ 0 ][0]= mx;
4579                         mv_cache[ 0 ][1]= my;
4580                     }
4581                 }else{
4582                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4583                     p[0] = p[1]=
4584                     p[8] = p[9]= 0;
4585                 }
4586             }
4587         }
4588     }else if(IS_DIRECT(mb_type)){
4589         pred_direct_motion(h, &mb_type);
4590         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4591     }else{
4592         int list, mx, my, i;
4593          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4594         if(IS_16X16(mb_type)){
4595             for(list=0; list<h->list_count; list++){
4596                     unsigned int val;
4597                     if(IS_DIR(mb_type, 0, list)){
4598                         if(h->ref_count[list]==1){
4599                             val= 0;
4600                         }else if(h->ref_count[list]==2){
4601                             val= get_bits1(&s->gb)^1;
4602                         }else{
4603                             val= get_ue_golomb_31(&s->gb);
4604                             if(val >= h->ref_count[list]){
4605                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4606                                 return -1;
4607                             }
4608                         }
4609                     }else
4610                         val= LIST_NOT_USED&0xFF;
4611                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4612             }
4613             for(list=0; list<h->list_count; list++){
4614                 unsigned int val;
4615                 if(IS_DIR(mb_type, 0, list)){
4616                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4617                     mx += get_se_golomb(&s->gb);
4618                     my += get_se_golomb(&s->gb);
4619                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4620
4621                     val= pack16to32(mx,my);
4622                 }else
4623                     val=0;
4624                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4625             }
4626         }
4627         else if(IS_16X8(mb_type)){
4628             for(list=0; list<h->list_count; list++){
4629                     for(i=0; i<2; i++){
4630                         unsigned int val;
4631                         if(IS_DIR(mb_type, i, list)){
4632                             if(h->ref_count[list] == 1){
4633                                 val= 0;
4634                             }else if(h->ref_count[list] == 2){
4635                                 val= get_bits1(&s->gb)^1;
4636                             }else{
4637                                 val= get_ue_golomb_31(&s->gb);
4638                                 if(val >= h->ref_count[list]){
4639                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4640                                     return -1;
4641                                 }
4642                             }
4643                         }else
4644                             val= LIST_NOT_USED&0xFF;
4645                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4646                     }
4647             }
4648             for(list=0; list<h->list_count; list++){
4649                 for(i=0; i<2; i++){
4650                     unsigned int val;
4651                     if(IS_DIR(mb_type, i, list)){
4652                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4653                         mx += get_se_golomb(&s->gb);
4654                         my += get_se_golomb(&s->gb);
4655                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4656
4657                         val= pack16to32(mx,my);
4658                     }else
4659                         val=0;
4660                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4661                 }
4662             }
4663         }else{
4664             assert(IS_8X16(mb_type));
4665             for(list=0; list<h->list_count; list++){
4666                     for(i=0; i<2; i++){
4667                         unsigned int val;
4668                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4669                             if(h->ref_count[list]==1){
4670                                 val= 0;
4671                             }else if(h->ref_count[list]==2){
4672                                 val= get_bits1(&s->gb)^1;
4673                             }else{
4674                                 val= get_ue_golomb_31(&s->gb);
4675                                 if(val >= h->ref_count[list]){
4676                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4677                                     return -1;
4678                                 }
4679                             }
4680                         }else
4681                             val= LIST_NOT_USED&0xFF;
4682                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4683                     }
4684             }
4685             for(list=0; list<h->list_count; list++){
4686                 for(i=0; i<2; i++){
4687                     unsigned int val;
4688                     if(IS_DIR(mb_type, i, list)){
4689                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4690                         mx += get_se_golomb(&s->gb);
4691                         my += get_se_golomb(&s->gb);
4692                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4693
4694                         val= pack16to32(mx,my);
4695                     }else
4696                         val=0;
4697                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4698                 }
4699             }
4700         }
4701     }
4702
4703     if(IS_INTER(mb_type))
4704         write_back_motion(h, mb_type);
4705
4706     if(!IS_INTRA16x16(mb_type)){
4707         cbp= get_ue_golomb(&s->gb);
4708         if(cbp > 47){
4709             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4710             return -1;
4711         }
4712
4713         if(CHROMA){
4714             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4715             else                     cbp= golomb_to_inter_cbp   [cbp];
4716         }else{
4717             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4718             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4719         }
4720     }
4721     h->cbp = cbp;
4722
4723     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4724         if(get_bits1(&s->gb)){
4725             mb_type |= MB_TYPE_8x8DCT;
4726             h->cbp_table[mb_xy]= cbp;
4727         }
4728     }
4729     s->current_picture.mb_type[mb_xy]= mb_type;
4730
4731     if(cbp || IS_INTRA16x16(mb_type)){
4732         int i8x8, i4x4, chroma_idx;
4733         int dquant;
4734         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4735         const uint8_t *scan, *scan8x8, *dc_scan;
4736
4737 //        fill_non_zero_count_cache(h);
4738
4739         if(IS_INTERLACED(mb_type)){
4740             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4741             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4742             dc_scan= luma_dc_field_scan;
4743         }else{
4744             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4745             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4746             dc_scan= luma_dc_zigzag_scan;
4747         }
4748
4749         dquant= get_se_golomb(&s->gb);
4750
4751         if( dquant > 25 || dquant < -26 ){
4752             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4753             return -1;
4754         }
4755
4756         s->qscale += dquant;
4757         if(((unsigned)s->qscale) > 51){
4758             if(s->qscale<0) s->qscale+= 52;
4759             else            s->qscale-= 52;
4760         }
4761
4762         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4763         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4764         if(IS_INTRA16x16(mb_type)){
4765             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4766                 return -1; //FIXME continue if partitioned and other return -1 too
4767             }
4768
4769             assert((cbp&15) == 0 || (cbp&15) == 15);
4770
4771             if(cbp&15){
4772                 for(i8x8=0; i8x8<4; i8x8++){
4773                     for(i4x4=0; i4x4<4; i4x4++){
4774                         const int index= i4x4 + 4*i8x8;
4775                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4776                             return -1;
4777                         }
4778                     }
4779                 }
4780             }else{
4781                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4782             }
4783         }else{
4784             for(i8x8=0; i8x8<4; i8x8++){
4785                 if(cbp & (1<<i8x8)){
4786                     if(IS_8x8DCT(mb_type)){
4787                         DCTELEM *buf = &h->mb[64*i8x8];
4788                         uint8_t *nnz;
4789                         for(i4x4=0; i4x4<4; i4x4++){
4790                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4791                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4792                                 return -1;
4793                         }
4794                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4795                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4796                     }else{
4797                         for(i4x4=0; i4x4<4; i4x4++){
4798                             const int index= i4x4 + 4*i8x8;
4799
4800                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4801                                 return -1;
4802                             }
4803                         }
4804                     }
4805                 }else{
4806                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4807                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4808                 }
4809             }
4810         }
4811
4812         if(cbp&0x30){
4813             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4814                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4815                     return -1;
4816                 }
4817         }
4818
4819         if(cbp&0x20){
4820             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4821                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4822                 for(i4x4=0; i4x4<4; i4x4++){
4823                     const int index= 16 + 4*chroma_idx + i4x4;
4824                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4825                         return -1;
4826                     }
4827                 }
4828             }
4829         }else{
4830             uint8_t * const nnz= &h->non_zero_count_cache[0];
4831             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4832             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4833         }
4834     }else{
4835         uint8_t * const nnz= &h->non_zero_count_cache[0];
4836         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4837         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4838         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4839     }
4840     s->current_picture.qscale_table[mb_xy]= s->qscale;
4841     write_back_non_zero_count(h);
4842
4843     if(MB_MBAFF){
4844         h->ref_count[0] >>= 1;
4845         h->ref_count[1] >>= 1;
4846     }
4847
4848     return 0;
4849 }
4850
4851 static int decode_cabac_field_decoding_flag(H264Context *h) {
4852     MpegEncContext * const s = &h->s;
4853     const int mb_x = s->mb_x;
4854     const int mb_y = s->mb_y & ~1;
4855     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4856     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4857
4858     unsigned int ctx = 0;
4859
4860     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4861         ctx += 1;
4862     }
4863     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4864         ctx += 1;
4865     }
4866
4867     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4868 }
4869
4870 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4871     uint8_t *state= &h->cabac_state[ctx_base];
4872     int mb_type;
4873
4874     if(intra_slice){
4875         MpegEncContext * const s = &h->s;
4876         const int mba_xy = h->left_mb_xy[0];
4877         const int mbb_xy = h->top_mb_xy;
4878         int ctx=0;
4879         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4880             ctx++;
4881         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4882             ctx++;
4883         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4884             return 0;   /* I4x4 */
4885         state += 2;
4886     }else{
4887         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4888             return 0;   /* I4x4 */
4889     }
4890
4891     if( get_cabac_terminate( &h->cabac ) )
4892         return 25;  /* PCM */
4893
4894     mb_type = 1; /* I16x16 */
4895     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4896     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4897         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4898     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4899     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4900     return mb_type;
4901 }
4902
4903 static int decode_cabac_mb_type_b( H264Context *h ) {
4904     MpegEncContext * const s = &h->s;
4905
4906         const int mba_xy = h->left_mb_xy[0];
4907         const int mbb_xy = h->top_mb_xy;
4908         int ctx = 0;
4909         int bits;
4910         assert(h->slice_type_nos == FF_B_TYPE);
4911
4912         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4913             ctx++;
4914         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4915             ctx++;
4916
4917         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4918             return 0; /* B_Direct_16x16 */
4919
4920         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4921             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4922         }
4923
4924         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4925         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4926         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4927         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4928         if( bits < 8 )
4929             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4930         else if( bits == 13 ) {
4931             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4932         } else if( bits == 14 )
4933             return 11; /* B_L1_L0_8x16 */
4934         else if( bits == 15 )
4935             return 22; /* B_8x8 */
4936
4937         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4938         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4939 }
4940
4941 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4942     MpegEncContext * const s = &h->s;
4943     int mba_xy, mbb_xy;
4944     int ctx = 0;
4945
4946     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4947         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4948         mba_xy = mb_xy - 1;
4949         if( (mb_y&1)
4950             && h->slice_table[mba_xy] == h->slice_num
4951             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4952             mba_xy += s->mb_stride;
4953         if( MB_FIELD ){
4954             mbb_xy = mb_xy - s->mb_stride;
4955             if( !(mb_y&1)
4956                 && h->slice_table[mbb_xy] == h->slice_num
4957                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4958                 mbb_xy -= s->mb_stride;
4959         }else
4960             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4961     }else{
4962         int mb_xy = h->mb_xy;
4963         mba_xy = mb_xy - 1;
4964         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4965     }
4966
4967     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4968         ctx++;
4969     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4970         ctx++;
4971
4972     if( h->slice_type_nos == FF_B_TYPE )
4973         ctx += 13;
4974     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4975 }
4976
4977 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4978     int mode = 0;
4979
4980     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4981         return pred_mode;
4982
4983     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4984     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4985     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4986
4987     if( mode >= pred_mode )
4988         return mode + 1;
4989     else
4990         return mode;
4991 }
4992
4993 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4994     const int mba_xy = h->left_mb_xy[0];
4995     const int mbb_xy = h->top_mb_xy;
4996
4997     int ctx = 0;
4998
4999     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5000     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5001         ctx++;
5002
5003     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5004         ctx++;
5005
5006     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5007         return 0;
5008
5009     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5010         return 1;
5011     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5012         return 2;
5013     else
5014         return 3;
5015 }
5016
5017 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5018     int cbp_b, cbp_a, ctx, cbp = 0;
5019
5020     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5021     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5022
5023     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5024     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5025     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5026     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5027     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5028     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5029     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5030     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5031     return cbp;
5032 }
5033 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5034     int ctx;
5035     int cbp_a, cbp_b;
5036
5037     cbp_a = (h->left_cbp>>4)&0x03;
5038     cbp_b = (h-> top_cbp>>4)&0x03;
5039
5040     ctx = 0;
5041     if( cbp_a > 0 ) ctx++;
5042     if( cbp_b > 0 ) ctx += 2;
5043     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5044         return 0;
5045
5046     ctx = 4;
5047     if( cbp_a == 2 ) ctx++;
5048     if( cbp_b == 2 ) ctx += 2;
5049     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5050 }
5051 static int decode_cabac_mb_dqp( H264Context *h) {
5052     int   ctx= h->last_qscale_diff != 0;
5053     int   val = 0;
5054
5055     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5056         ctx= 2+(ctx>>1);
5057         val++;
5058         if(val > 102) //prevent infinite loop
5059             return INT_MIN;
5060     }
5061
5062     if( val&0x01 )
5063         return   (val + 1)>>1 ;
5064     else
5065         return -((val + 1)>>1);
5066 }
5067 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5068     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5069         return 0;   /* 8x8 */
5070     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5071         return 1;   /* 8x4 */
5072     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5073         return 2;   /* 4x8 */
5074     return 3;       /* 4x4 */
5075 }
5076 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5077     int type;
5078     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5079         return 0;   /* B_Direct_8x8 */
5080     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5081         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5082     type = 3;
5083     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5084         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5085             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5086         type += 4;
5087     }
5088     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5089     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5090     return type;
5091 }
5092
5093 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5094     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5095 }
5096
5097 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5098     int refa = h->ref_cache[list][scan8[n] - 1];
5099     int refb = h->ref_cache[list][scan8[n] - 8];
5100     int ref  = 0;
5101     int ctx  = 0;
5102
5103     if( h->slice_type_nos == FF_B_TYPE) {
5104         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5105             ctx++;
5106         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5107             ctx += 2;
5108     } else {
5109         if( refa > 0 )
5110             ctx++;
5111         if( refb > 0 )
5112             ctx += 2;
5113     }
5114
5115     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5116         ref++;
5117         ctx = (ctx>>2)+4;
5118         if(ref >= 32 /*h->ref_list[list]*/){
5119             return -1;
5120         }
5121     }
5122     return ref;
5123 }
5124
5125 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5126     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5127                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5128     int ctxbase = (l == 0) ? 40 : 47;
5129     int mvd;
5130     int ctx = (amvd>2) + (amvd>32);
5131
5132     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5133         return 0;
5134
5135     mvd= 1;
5136     ctx= 3;
5137     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5138         mvd++;
5139         if( ctx < 6 )
5140             ctx++;
5141     }
5142
5143     if( mvd >= 9 ) {
5144         int k = 3;
5145         while( get_cabac_bypass( &h->cabac ) ) {
5146             mvd += 1 << k;
5147             k++;
5148             if(k>24){
5149                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5150                 return INT_MIN;
5151             }
5152         }
5153         while( k-- ) {
5154             if( get_cabac_bypass( &h->cabac ) )
5155                 mvd += 1 << k;
5156         }
5157     }
5158     return get_cabac_bypass_sign( &h->cabac, -mvd );
5159 }
5160
5161 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5162     int nza, nzb;
5163     int ctx = 0;
5164
5165     if( is_dc ) {
5166         if( cat == 0 ) {
5167             nza = h->left_cbp&0x100;
5168             nzb = h-> top_cbp&0x100;
5169         } else {
5170             nza = (h->left_cbp>>(6+idx))&0x01;
5171             nzb = (h-> top_cbp>>(6+idx))&0x01;
5172         }
5173     } else {
5174         assert(cat == 1 || cat == 2 || cat == 4);
5175         nza = h->non_zero_count_cache[scan8[idx] - 1];
5176         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5177     }
5178
5179     if( nza > 0 )
5180         ctx++;
5181
5182     if( nzb > 0 )
5183         ctx += 2;
5184
5185     return ctx + 4 * cat;
5186 }
5187
5188 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5189     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5190     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5191     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5192     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5193 };
5194
5195 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5196     static const int significant_coeff_flag_offset[2][6] = {
5197       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5198       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5199     };
5200     static const int last_coeff_flag_offset[2][6] = {
5201       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5202       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5203     };
5204     static const int coeff_abs_level_m1_offset[6] = {
5205         227+0, 227+10, 227+20, 227+30, 227+39, 426
5206     };
5207     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5208       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5209         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5210         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5211        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5212       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5213         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5214         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5215         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5216     };
5217     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5218      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5219      * map node ctx => cabac ctx for level=1 */
5220     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5221     /* map node ctx => cabac ctx for level>1 */
5222     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5223     static const uint8_t coeff_abs_level_transition[2][8] = {
5224     /* update node ctx after decoding a level=1 */
5225         { 1, 2, 3, 3, 4, 5, 6, 7 },
5226     /* update node ctx after decoding a level>1 */
5227         { 4, 4, 4, 4, 5, 6, 7, 7 }
5228     };
5229
5230     int index[64];
5231
5232     int av_unused last;
5233     int coeff_count = 0;
5234     int node_ctx = 0;
5235
5236     uint8_t *significant_coeff_ctx_base;
5237     uint8_t *last_coeff_ctx_base;
5238     uint8_t *abs_level_m1_ctx_base;
5239
5240 #if !ARCH_X86
5241 #define CABAC_ON_STACK
5242 #endif
5243 #ifdef CABAC_ON_STACK
5244 #define CC &cc
5245     CABACContext cc;
5246     cc.range     = h->cabac.range;
5247     cc.low       = h->cabac.low;
5248     cc.bytestream= h->cabac.bytestream;
5249 #else
5250 #define CC &h->cabac
5251 #endif
5252
5253
5254     /* cat: 0-> DC 16x16  n = 0
5255      *      1-> AC 16x16  n = luma4x4idx
5256      *      2-> Luma4x4   n = luma4x4idx
5257      *      3-> DC Chroma n = iCbCr
5258      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5259      *      5-> Luma8x8   n = 4 * luma8x8idx
5260      */
5261
5262     /* read coded block flag */
5263     if( is_dc || cat != 5 ) {
5264         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5265             if( !is_dc )
5266                 h->non_zero_count_cache[scan8[n]] = 0;
5267
5268 #ifdef CABAC_ON_STACK
5269             h->cabac.range     = cc.range     ;
5270             h->cabac.low       = cc.low       ;
5271             h->cabac.bytestream= cc.bytestream;
5272 #endif
5273             return;
5274         }
5275     }
5276
5277     significant_coeff_ctx_base = h->cabac_state
5278         + significant_coeff_flag_offset[MB_FIELD][cat];
5279     last_coeff_ctx_base = h->cabac_state
5280         + last_coeff_flag_offset[MB_FIELD][cat];
5281     abs_level_m1_ctx_base = h->cabac_state
5282         + coeff_abs_level_m1_offset[cat];
5283
5284     if( !is_dc && cat == 5 ) {
5285 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5286         for(last= 0; last < coefs; last++) { \
5287             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5288             if( get_cabac( CC, sig_ctx )) { \
5289                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5290                 index[coeff_count++] = last; \
5291                 if( get_cabac( CC, last_ctx ) ) { \
5292                     last= max_coeff; \
5293                     break; \
5294                 } \
5295             } \
5296         }\
5297         if( last == max_coeff -1 ) {\
5298             index[coeff_count++] = last;\
5299         }
5300         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5301 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5302         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5303     } else {
5304         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5305 #else
5306         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5307     } else {
5308         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5309 #endif
5310     }
5311     assert(coeff_count > 0);
5312
5313     if( is_dc ) {
5314         if( cat == 0 )
5315             h->cbp_table[h->mb_xy] |= 0x100;
5316         else
5317             h->cbp_table[h->mb_xy] |= 0x40 << n;
5318     } else {
5319         if( cat == 5 )
5320             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5321         else {
5322             assert( cat == 1 || cat == 2 || cat == 4 );
5323             h->non_zero_count_cache[scan8[n]] = coeff_count;
5324         }
5325     }
5326
5327     do {
5328         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5329
5330         int j= scantable[index[--coeff_count]];
5331
5332         if( get_cabac( CC, ctx ) == 0 ) {
5333             node_ctx = coeff_abs_level_transition[0][node_ctx];
5334             if( is_dc ) {
5335                 block[j] = get_cabac_bypass_sign( CC, -1);
5336             }else{
5337                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5338             }
5339         } else {
5340             int coeff_abs = 2;
5341             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5342             node_ctx = coeff_abs_level_transition[1][node_ctx];
5343
5344             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5345                 coeff_abs++;
5346             }
5347
5348             if( coeff_abs >= 15 ) {
5349                 int j = 0;
5350                 while( get_cabac_bypass( CC ) ) {
5351                     j++;
5352                 }
5353
5354                 coeff_abs=1;
5355                 while( j-- ) {
5356                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5357                 }
5358                 coeff_abs+= 14;
5359             }
5360
5361             if( is_dc ) {
5362                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5363             }else{
5364                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5365             }
5366         }
5367     } while( coeff_count );
5368 #ifdef CABAC_ON_STACK
5369             h->cabac.range     = cc.range     ;
5370             h->cabac.low       = cc.low       ;
5371             h->cabac.bytestream= cc.bytestream;
5372 #endif
5373
5374 }
5375
5376 #if !CONFIG_SMALL
5377 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5378     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5379 }
5380
5381 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5382     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5383 }
5384 #endif
5385
5386 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5387 #if CONFIG_SMALL
5388     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5389 #else
5390     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5391     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5392 #endif
5393 }
5394
5395 static inline void compute_mb_neighbors(H264Context *h)
5396 {
5397     MpegEncContext * const s = &h->s;
5398     const int mb_xy  = h->mb_xy;
5399     h->top_mb_xy     = mb_xy - s->mb_stride;
5400     h->left_mb_xy[0] = mb_xy - 1;
5401     if(FRAME_MBAFF){
5402         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5403         const int top_pair_xy      = pair_xy     - s->mb_stride;
5404         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5405         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5406         const int curr_mb_field_flag = MB_FIELD;
5407         const int bottom = (s->mb_y & 1);
5408
5409         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5410             h->top_mb_xy -= s->mb_stride;
5411         }
5412         if (!left_mb_field_flag == curr_mb_field_flag) {
5413             h->left_mb_xy[0] = pair_xy - 1;
5414         }
5415     } else if (FIELD_PICTURE) {
5416         h->top_mb_xy -= s->mb_stride;
5417     }
5418     return;
5419 }
5420
5421 /**
5422  * decodes a macroblock
5423  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5424  */
5425 static int decode_mb_cabac(H264Context *h) {
5426     MpegEncContext * const s = &h->s;
5427     int mb_xy;
5428     int mb_type, partition_count, cbp = 0;
5429     int dct8x8_allowed= h->pps.transform_8x8_mode;
5430
5431     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5432
5433     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5434     if( h->slice_type_nos != FF_I_TYPE ) {
5435         int skip;
5436         /* a skipped mb needs the aff flag from the following mb */
5437         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5438             predict_field_decoding_flag(h);
5439         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5440             skip = h->next_mb_skipped;
5441         else
5442             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5443         /* read skip flags */
5444         if( skip ) {
5445             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5446                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5447                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5448                 if(!h->next_mb_skipped)
5449                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5450             }
5451
5452             decode_mb_skip(h);
5453
5454             h->cbp_table[mb_xy] = 0;
5455             h->chroma_pred_mode_table[mb_xy] = 0;
5456             h->last_qscale_diff = 0;
5457
5458             return 0;
5459
5460         }
5461     }
5462     if(FRAME_MBAFF){
5463         if( (s->mb_y&1) == 0 )
5464             h->mb_mbaff =
5465             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5466     }
5467
5468     h->prev_mb_skipped = 0;
5469
5470     compute_mb_neighbors(h);
5471
5472     if( h->slice_type_nos == FF_B_TYPE ) {
5473         mb_type = decode_cabac_mb_type_b( h );
5474         if( mb_type < 23 ){
5475             partition_count= b_mb_type_info[mb_type].partition_count;
5476             mb_type=         b_mb_type_info[mb_type].type;
5477         }else{
5478             mb_type -= 23;
5479             goto decode_intra_mb;
5480         }
5481     } else if( h->slice_type_nos == FF_P_TYPE ) {
5482         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5483             /* P-type */
5484             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5485                 /* P_L0_D16x16, P_8x8 */
5486                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5487             } else {
5488                 /* P_L0_D8x16, P_L0_D16x8 */
5489                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5490             }
5491             partition_count= p_mb_type_info[mb_type].partition_count;
5492             mb_type=         p_mb_type_info[mb_type].type;
5493         } else {
5494             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5495             goto decode_intra_mb;
5496         }
5497     } else {
5498         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5499         if(h->slice_type == FF_SI_TYPE && mb_type)
5500             mb_type--;
5501         assert(h->slice_type_nos == FF_I_TYPE);
5502 decode_intra_mb:
5503         partition_count = 0;
5504         cbp= i_mb_type_info[mb_type].cbp;
5505         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5506         mb_type= i_mb_type_info[mb_type].type;
5507     }
5508     if(MB_FIELD)
5509         mb_type |= MB_TYPE_INTERLACED;
5510
5511     h->slice_table[ mb_xy ]= h->slice_num;
5512
5513     if(IS_INTRA_PCM(mb_type)) {
5514         const uint8_t *ptr;
5515
5516         // We assume these blocks are very rare so we do not optimize it.
5517         // FIXME The two following lines get the bitstream position in the cabac
5518         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5519         ptr= h->cabac.bytestream;
5520         if(h->cabac.low&0x1) ptr--;
5521         if(CABAC_BITS==16){
5522             if(h->cabac.low&0x1FF) ptr--;
5523         }
5524
5525         // The pixels are stored in the same order as levels in h->mb array.
5526         memcpy(h->mb, ptr, 256); ptr+=256;
5527         if(CHROMA){
5528             memcpy(h->mb+128, ptr, 128); ptr+=128;
5529         }
5530
5531         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5532
5533         // All blocks are present
5534         h->cbp_table[mb_xy] = 0x1ef;
5535         h->chroma_pred_mode_table[mb_xy] = 0;
5536         // In deblocking, the quantizer is 0
5537         s->current_picture.qscale_table[mb_xy]= 0;
5538         // All coeffs are present
5539         memset(h->non_zero_count[mb_xy], 16, 16);
5540         s->current_picture.mb_type[mb_xy]= mb_type;
5541         h->last_qscale_diff = 0;
5542         return 0;
5543     }
5544
5545     if(MB_MBAFF){
5546         h->ref_count[0] <<= 1;
5547         h->ref_count[1] <<= 1;
5548     }
5549
5550     fill_caches(h, mb_type, 0);
5551
5552     if( IS_INTRA( mb_type ) ) {
5553         int i, pred_mode;
5554         if( IS_INTRA4x4( mb_type ) ) {
5555             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5556                 mb_type |= MB_TYPE_8x8DCT;
5557                 for( i = 0; i < 16; i+=4 ) {
5558                     int pred = pred_intra_mode( h, i );
5559                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5560                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5561                 }
5562             } else {
5563                 for( i = 0; i < 16; i++ ) {
5564                     int pred = pred_intra_mode( h, i );
5565                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5566
5567                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5568                 }
5569             }
5570             write_back_intra_pred_mode(h);
5571             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5572         } else {
5573             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5574             if( h->intra16x16_pred_mode < 0 ) return -1;
5575         }
5576         if(CHROMA){
5577             h->chroma_pred_mode_table[mb_xy] =
5578             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5579
5580             pred_mode= check_intra_pred_mode( h, pred_mode );
5581             if( pred_mode < 0 ) return -1;
5582             h->chroma_pred_mode= pred_mode;
5583         }
5584     } else if( partition_count == 4 ) {
5585         int i, j, sub_partition_count[4], list, ref[2][4];
5586
5587         if( h->slice_type_nos == FF_B_TYPE ) {
5588             for( i = 0; i < 4; i++ ) {
5589                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5590                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5591                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5592             }
5593             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5594                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5595                 pred_direct_motion(h, &mb_type);
5596                 h->ref_cache[0][scan8[4]] =
5597                 h->ref_cache[1][scan8[4]] =
5598                 h->ref_cache[0][scan8[12]] =
5599                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5600                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5601                     for( i = 0; i < 4; i++ )
5602                         if( IS_DIRECT(h->sub_mb_type[i]) )
5603                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5604                 }
5605             }
5606         } else {
5607             for( i = 0; i < 4; i++ ) {
5608                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5609                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5610                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5611             }
5612         }
5613
5614         for( list = 0; list < h->list_count; list++ ) {
5615                 for( i = 0; i < 4; i++ ) {
5616                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5617                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5618                         if( h->ref_count[list] > 1 ){
5619                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5620                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5621                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5622                                 return -1;
5623                             }
5624                         }else
5625                             ref[list][i] = 0;
5626                     } else {
5627                         ref[list][i] = -1;
5628                     }
5629                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5630                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5631                 }
5632         }
5633
5634         if(dct8x8_allowed)
5635             dct8x8_allowed = get_dct8x8_allowed(h);
5636
5637         for(list=0; list<h->list_count; list++){
5638             for(i=0; i<4; i++){
5639                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5640                 if(IS_DIRECT(h->sub_mb_type[i])){
5641                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5642                     continue;
5643                 }
5644
5645                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5646                     const int sub_mb_type= h->sub_mb_type[i];
5647                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5648                     for(j=0; j<sub_partition_count[i]; j++){
5649                         int mpx, mpy;
5650                         int mx, my;
5651                         const int index= 4*i + block_width*j;
5652                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5653                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5654                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5655
5656                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5657                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5658                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5659
5660                         if(IS_SUB_8X8(sub_mb_type)){
5661                             mv_cache[ 1 ][0]=
5662                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5663                             mv_cache[ 1 ][1]=
5664                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5665
5666                             mvd_cache[ 1 ][0]=
5667                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5668                             mvd_cache[ 1 ][1]=
5669                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5670                         }else if(IS_SUB_8X4(sub_mb_type)){
5671                             mv_cache[ 1 ][0]= mx;
5672                             mv_cache[ 1 ][1]= my;
5673
5674                             mvd_cache[ 1 ][0]= mx - mpx;
5675                             mvd_cache[ 1 ][1]= my - mpy;
5676                         }else if(IS_SUB_4X8(sub_mb_type)){
5677                             mv_cache[ 8 ][0]= mx;
5678                             mv_cache[ 8 ][1]= my;
5679
5680                             mvd_cache[ 8 ][0]= mx - mpx;
5681                             mvd_cache[ 8 ][1]= my - mpy;
5682                         }
5683                         mv_cache[ 0 ][0]= mx;
5684                         mv_cache[ 0 ][1]= my;
5685
5686                         mvd_cache[ 0 ][0]= mx - mpx;
5687                         mvd_cache[ 0 ][1]= my - mpy;
5688                     }
5689                 }else{
5690                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5691                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5692                     p[0] = p[1] = p[8] = p[9] = 0;
5693                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5694                 }
5695             }
5696         }
5697     } else if( IS_DIRECT(mb_type) ) {
5698         pred_direct_motion(h, &mb_type);
5699         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5700         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5701         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5702     } else {
5703         int list, mx, my, i, mpx, mpy;
5704         if(IS_16X16(mb_type)){
5705             for(list=0; list<h->list_count; list++){
5706                 if(IS_DIR(mb_type, 0, list)){
5707                     int ref;
5708                     if(h->ref_count[list] > 1){
5709                         ref= decode_cabac_mb_ref(h, list, 0);
5710                         if(ref >= (unsigned)h->ref_count[list]){
5711                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5712                             return -1;
5713                         }
5714                     }else
5715                         ref=0;
5716                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5717                 }else
5718                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5719             }
5720             for(list=0; list<h->list_count; list++){
5721                 if(IS_DIR(mb_type, 0, list)){
5722                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5723
5724                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5725                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5726                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5727
5728                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5729                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5730                 }else
5731                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5732             }
5733         }
5734         else if(IS_16X8(mb_type)){
5735             for(list=0; list<h->list_count; list++){
5736                     for(i=0; i<2; i++){
5737                         if(IS_DIR(mb_type, i, list)){
5738                             int ref;
5739                             if(h->ref_count[list] > 1){
5740                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5741                                 if(ref >= (unsigned)h->ref_count[list]){
5742                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5743                                     return -1;
5744                                 }
5745                             }else
5746                                 ref=0;
5747                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5748                         }else
5749                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5750                     }
5751             }
5752             for(list=0; list<h->list_count; list++){
5753                 for(i=0; i<2; i++){
5754                     if(IS_DIR(mb_type, i, list)){
5755                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5756                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5757                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5758                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5759
5760                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5761                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5762                     }else{
5763                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5764                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5765                     }
5766                 }
5767             }
5768         }else{
5769             assert(IS_8X16(mb_type));
5770             for(list=0; list<h->list_count; list++){
5771                     for(i=0; i<2; i++){
5772                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5773                             int ref;
5774                             if(h->ref_count[list] > 1){
5775                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5776                                 if(ref >= (unsigned)h->ref_count[list]){
5777                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5778                                     return -1;
5779                                 }
5780                             }else
5781                                 ref=0;
5782                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5783                         }else
5784                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5785                     }
5786             }
5787             for(list=0; list<h->list_count; list++){
5788                 for(i=0; i<2; i++){
5789                     if(IS_DIR(mb_type, i, list)){
5790                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5791                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5792                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5793
5794                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5795                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5796                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5797                     }else{
5798                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5799                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5800                     }
5801                 }
5802             }
5803         }
5804     }
5805
5806    if( IS_INTER( mb_type ) ) {
5807         h->chroma_pred_mode_table[mb_xy] = 0;
5808         write_back_motion( h, mb_type );
5809    }
5810
5811     if( !IS_INTRA16x16( mb_type ) ) {
5812         cbp  = decode_cabac_mb_cbp_luma( h );
5813         if(CHROMA)
5814             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5815     }
5816
5817     h->cbp_table[mb_xy] = h->cbp = cbp;
5818
5819     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5820         if( decode_cabac_mb_transform_size( h ) )
5821             mb_type |= MB_TYPE_8x8DCT;
5822     }
5823     s->current_picture.mb_type[mb_xy]= mb_type;
5824
5825     if( cbp || IS_INTRA16x16( mb_type ) ) {
5826         const uint8_t *scan, *scan8x8, *dc_scan;
5827         const uint32_t *qmul;
5828         int dqp;
5829
5830         if(IS_INTERLACED(mb_type)){
5831             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5832             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5833             dc_scan= luma_dc_field_scan;
5834         }else{
5835             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5836             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5837             dc_scan= luma_dc_zigzag_scan;
5838         }
5839
5840         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5841         if( dqp == INT_MIN ){
5842             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5843             return -1;
5844         }
5845         s->qscale += dqp;
5846         if(((unsigned)s->qscale) > 51){
5847             if(s->qscale<0) s->qscale+= 52;
5848             else            s->qscale-= 52;
5849         }
5850         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5851         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5852
5853         if( IS_INTRA16x16( mb_type ) ) {
5854             int i;
5855             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5856             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5857
5858             if( cbp&15 ) {
5859                 qmul = h->dequant4_coeff[0][s->qscale];
5860                 for( i = 0; i < 16; i++ ) {
5861                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5862                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5863                 }
5864             } else {
5865                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5866             }
5867         } else {
5868             int i8x8, i4x4;
5869             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5870                 if( cbp & (1<<i8x8) ) {
5871                     if( IS_8x8DCT(mb_type) ) {
5872                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5873                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5874                     } else {
5875                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5876                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5877                             const int index = 4*i8x8 + i4x4;
5878                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5879 //START_TIMER
5880                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5881 //STOP_TIMER("decode_residual")
5882                         }
5883                     }
5884                 } else {
5885                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5886                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5887                 }
5888             }
5889         }
5890
5891         if( cbp&0x30 ){
5892             int c;
5893             for( c = 0; c < 2; c++ ) {
5894                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5895                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5896             }
5897         }
5898
5899         if( cbp&0x20 ) {
5900             int c, i;
5901             for( c = 0; c < 2; c++ ) {
5902                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5903                 for( i = 0; i < 4; i++ ) {
5904                     const int index = 16 + 4 * c + i;
5905                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5906                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5907                 }
5908             }
5909         } else {
5910             uint8_t * const nnz= &h->non_zero_count_cache[0];
5911             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5912             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5913         }
5914     } else {
5915         uint8_t * const nnz= &h->non_zero_count_cache[0];
5916         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5917         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5918         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5919         h->last_qscale_diff = 0;
5920     }
5921
5922     s->current_picture.qscale_table[mb_xy]= s->qscale;
5923     write_back_non_zero_count(h);
5924
5925     if(MB_MBAFF){
5926         h->ref_count[0] >>= 1;
5927         h->ref_count[1] >>= 1;
5928     }
5929
5930     return 0;
5931 }
5932
5933
/**
 * Filter one luma edge through the DSP "h" loop-filter functions.
 *
 * alpha, beta and the tc values are looked up from qp combined with the
 * slice alpha/beta offsets. Only bS[0] selects the filter: when it is below 4
 * the regular filter runs with one tc value per bS entry, otherwise the
 * intra filter runs and the remaining bS entries are not read.
 */
5934 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5935     const int idx_alpha = h->slice_alpha_c0_offset + qp;
5936     const int alpha     = (alpha_table+52)[idx_alpha];
5937     const int beta      = (beta_table+52)[h->slice_beta_offset + qp];
5938     int8_t tc[4];
5939     int k;
5940
5941     if( bS[0] >= 4 ) {
5942         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5943         return;
5944     }
5945
5946     for( k = 0; k < 4; k++ )
5947         tc[k] = (tc0_table+52)[idx_alpha][bS[k]];
5948     h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5949 }
/**
 * Filter one chroma edge through the DSP "h" chroma loop-filter functions.
 *
 * Same table lookups as the luma variant, except that every tc value is the
 * table entry plus one. Only bS[0] selects the filter: below 4 the regular
 * filter runs, otherwise the intra filter runs.
 */
5950 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5951     const int idx_alpha = h->slice_alpha_c0_offset + qp;
5952     const int alpha     = (alpha_table+52)[idx_alpha];
5953     const int beta      = (beta_table+52)[h->slice_beta_offset + qp];
5954     int8_t tc[4];
5955     int k;
5956
5957     if( bS[0] >= 4 ) {
5958         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5959         return;
5960     }
5961
5962     for( k = 0; k < 4; k++ )
5963         tc[k] = (tc0_table+52)[idx_alpha][bS[k]] + 1;
5964     h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5965 }
5966
/**
 * Filter the luma edge lying between pix[-1] and pix[0] over 16 rows,
 * choosing the boundary strength and the QP separately for every row.
 *
 * Unlike filter_mb_edgev() the filtering is done in C right here instead of
 * going through h->s.dsp. Judging by the name this is the variant used for
 * MBAFF pictures (NOTE(review): confirm against the callers).
 *
 * @param h      decoder context; slice_alpha_c0_offset and slice_beta_offset are read
 * @param pix    sample just after the edge in the first row; modified in place
 * @param stride step added to pix to reach the next row
 * @param bS     boundary strengths: 0 skips the row, other values below 4
 *               select the clipped filter, values of 4 and above the strong one
 * @param qp     two QP values; which one a row uses depends on MB_FIELD
 */
5967 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5968     int i;
5969     for( i = 0; i < 16; i++, pix += stride) {
5970         int index_a;
5971         int alpha;
5972         int beta;
5973
5974         int qp_index;
             /* With MB_FIELD set, row i uses bS[i>>1]. Otherwise bit 0 of that
              * index is replaced by the row parity, so consecutive rows
              * alternate between an even and an odd bS entry. */
5975         int bS_index = (i >> 1);
5976         if (!MB_FIELD) {
5977             bS_index &= ~1;
5978             bS_index |= (i & 1);
5979         }
5980
             /* strength 0: this row is left untouched */
5981         if( bS[bS_index] == 0 ) {
5982             continue;
5983         }
5984
             /* MB_FIELD: rows 0-7 use qp[0] and rows 8-15 use qp[1];
              * otherwise even rows use qp[0] and odd rows qp[1]. */
5985         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
             /* the tables are addressed relative to their element 52 */
5986         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5987         alpha = (alpha_table+52)[index_a];
5988         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5989
5990         if( bS[bS_index] < 4 ) {
                 /* clipped filter: corrections are bounded by tc0 / tc */
5991             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
                 /* p* are the samples before the edge, q* the samples after it;
                  * all are captured before anything is written back */
5992             const int p0 = pix[-1];
5993             const int p1 = pix[-2];
5994             const int p2 = pix[-3];
5995             const int q0 = pix[0];
5996             const int q1 = pix[1];
5997             const int q2 = pix[2];
5998
                 /* filter only if the step across the edge and the steps on
                  * either side are below the alpha / beta thresholds */
5999             if( FFABS( p0 - q0 ) < alpha &&
6000                 FFABS( p1 - p0 ) < beta &&
6001                 FFABS( q1 - q0 ) < beta ) {
6002                 int tc = tc0;
6003                 int i_delta;
6004
                     /* each side that is smooth enough gets its second sample
                      * corrected (clipped to +-tc0) and widens the clip range
                      * used for p0/q0 by one */
6005                 if( FFABS( p2 - p0 ) < beta ) {
6006                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6007                     tc++;
6008                 }
6009                 if( FFABS( q2 - q0 ) < beta ) {
6010                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6011                     tc++;
6012                 }
6013
                     /* the same delta, clipped to +-tc, is added to p0 and
                      * subtracted from q0 */
6014                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6015                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6016                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* trace output only */
6017                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6018             }
6019         }else{
                 /* strong filter (bS >= 4): no tc clipping is involved */
6020             const int p0 = pix[-1];
6021             const int p1 = pix[-2];
6022             const int p2 = pix[-3];
6023
6024             const int q0 = pix[0];
6025             const int q1 = pix[1];
6026             const int q2 = pix[2];
6027
6028             if( FFABS( p0 - q0 ) < alpha &&
6029                 FFABS( p1 - p0 ) < beta &&
6030                 FFABS( q1 - q0 ) < beta ) {
6031
                     /* a small step across the edge allows up to three samples
                      * per side to be rewritten; otherwise only p0 and q0 are */
6032                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6033                     if( FFABS( p2 - p0 ) < beta)
6034                     {
6035                         const int p3 = pix[-4];
6036                         /* p0', p1', p2' */
6037                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6038                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6039                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6040                     } else {
6041                         /* p0' */
6042                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6043                     }
6044                     if( FFABS( q2 - q0 ) < beta)
6045                     {
6046                         const int q3 = pix[3];
6047                         /* q0', q1', q2' */
6048                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6049                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6050                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6051                     } else {
6052                         /* q0' */
6053                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6054                     }
6055                 }else{
6056                     /* p0', q0' */
6057                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6058                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6059                 }
                     /* trace output only: samples before and after filtering */
6060                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6061             }
6062         }
6063     }
6064 }
/**
 * Filter the chroma edge lying between pix[-1] and pix[0] over 8 rows, with
 * one boundary strength per row and a per-row choice between two QP values.
 *
 * Only the two samples next to the edge (pix[-1] and pix[0]) are ever
 * written. Judging by the name this is the chroma counterpart of
 * filter_mb_mbaff_edgev() (NOTE(review): confirm against the callers).
 *
 * @param h      decoder context; slice_alpha_c0_offset and slice_beta_offset are read
 * @param pix    sample just after the edge in the first row; modified in place
 * @param stride step added to pix to reach the next row
 * @param bS     one boundary strength per row: 0 skips the row, other values
 *               below 4 select the clipped filter, 4 and above the strong one
 * @param qp     two QP values; which one a row uses depends on MB_FIELD
 */
6065 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6066     int i;
6067     for( i = 0; i < 8; i++, pix += stride) {
6068         int index_a;
6069         int alpha;
6070         int beta;
6071
6072         int qp_index;
             /* row i uses bS[i] directly */
6073         int bS_index = i;
6074
             /* strength 0: this row is left untouched */
6075         if( bS[bS_index] == 0 ) {
6076             continue;
6077         }
6078
             /* MB_FIELD: rows 0-3 use qp[0] and rows 4-7 use qp[1];
              * otherwise even rows use qp[0] and odd rows qp[1]. */
6079         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
             /* the tables are addressed relative to their element 52 */
6080         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6081         alpha = (alpha_table+52)[index_a];
6082         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6083
6084         if( bS[bS_index] < 4 ) {
                 /* clipped filter; the clip bound is the table value plus one */
6085             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6086             const int p0 = pix[-1];
6087             const int p1 = pix[-2];
6088             const int q0 = pix[0];
6089             const int q1 = pix[1];
6090
6091             if( FFABS( p0 - q0 ) < alpha &&
6092                 FFABS( p1 - p0 ) < beta &&
6093                 FFABS( q1 - q0 ) < beta ) {
6094                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6095
6096                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6097                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* trace output only; note it also reads pix[-3] and pix[2] */
6098                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6099             }
6100         }else{
                 /* strong filter (bS >= 4): no tc clipping is involved */
6101             const int p0 = pix[-1];
6102             const int p1 = pix[-2];
6103             const int q0 = pix[0];
6104             const int q1 = pix[1];
6105
6106             if( FFABS( p0 - q0 ) < alpha &&
6107                 FFABS( p1 - p0 ) < beta &&
6108                 FFABS( q1 - q0 ) < beta ) {
6109
6110                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6111                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
                     /* trace output only; note it also reads pix[-3] and pix[2] */
6112                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6113             }
6114         }
6115     }
6116 }
6117
/**
 * Filter one luma edge through the DSP "v" loop-filter functions.
 *
 * alpha, beta and the tc values are looked up from qp combined with the
 * slice alpha/beta offsets. Only bS[0] selects the filter: when it is below 4
 * the regular filter runs with one tc value per bS entry, otherwise the
 * intra filter runs and the remaining bS entries are not read.
 */
6118 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6119     const int idx_alpha = h->slice_alpha_c0_offset + qp;
6120     const int alpha     = (alpha_table+52)[idx_alpha];
6121     const int beta      = (beta_table+52)[h->slice_beta_offset + qp];
6122     int8_t tc[4];
6123     int k;
6124
6125     if( bS[0] >= 4 ) {
6126         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6127         return;
6128     }
6129
6130     for( k = 0; k < 4; k++ )
6131         tc[k] = (tc0_table+52)[idx_alpha][bS[k]];
6132     h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6133 }
6134
/**
 * Filter one chroma edge through the DSP "v" chroma loop-filter functions.
 *
 * Same table lookups as the luma variant, except that every tc value is the
 * table entry plus one. Only bS[0] selects the filter: below 4 the regular
 * filter runs, otherwise the intra filter runs.
 */
6135 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6136     const int idx_alpha = h->slice_alpha_c0_offset + qp;
6137     const int alpha     = (alpha_table+52)[idx_alpha];
6138     const int beta      = (beta_table+52)[h->slice_beta_offset + qp];
6139     int8_t tc[4];
6140     int k;
6141
6142     if( bS[0] >= 4 ) {
6143         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6144         return;
6145     }
6146
6147     for( k = 0; k < 4; k++ )
6148         tc[k] = (tc0_table+52)[idx_alpha][bS[k]] + 1;
6149     h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6150 }
6151
/**
 * Deblock one macroblock through a shortcut path, handing the work to
 * filter_mb() whenever the shortcut does not apply.
 *
 * The shortcut is skipped (and filter_mb() called instead) when the
 * macroblock is in column 0 or in the first row, when no
 * h264_loop_filter_strength DSP function is available, when
 * h->pps.chroma_qp_diff is nonzero, when CODEC_FLAG2_FAST is not set, or when
 * deblocking_filter is 2 and the top or left neighbour has a different
 * slice_table entry.
 *
 * @param h          decoder context; the macroblock index is taken from h->mb_xy
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock, filtered in place
 * @param img_cb     Cb samples of this macroblock, filtered in place
 * @param img_cr     Cr samples of this macroblock, filtered in place
 * @param linesize   luma row stride
 * @param uvlinesize chroma row stride
 */
6152 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6153     MpegEncContext * const s = &h->s;
         /* 1 for a bottom-field picture, 0 otherwise */
6154     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6155     int mb_xy, mb_type;
6156     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6157
6158     mb_xy = h->mb_xy;
6159
6160     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6161         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
6162        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6163                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6164         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6165         return;
6166     }
6167     assert(!FRAME_MBAFF);
6168
         /* qp: this macroblock; qp0/qp1 start as the left/top neighbour's QP
          * and are then replaced by the rounded average with qp. The chroma
          * values qpc* are derived the same way, using chroma index 0 only. */
6169     mb_type = s->current_picture.mb_type[mb_xy];
6170     qp = s->current_picture.qscale_table[mb_xy];
6171     qp0 = s->current_picture.qscale_table[mb_xy-1];
6172     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6173     qpc = get_chroma_qp( h, 0, qp );
6174     qpc0 = get_chroma_qp( h, 0, qp0 );
6175     qpc1 = get_chroma_qp( h, 0, qp1 );
6176     qp0 = (qp + qp0 + 1) >> 1;
6177     qp1 = (qp + qp1 + 1) >> 1;
6178     qpc0 = (qpc + qpc0 + 1) >> 1;
6179     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Nothing is filtered when every QP involved is at or below the
          * threshold. NOTE(review): presumably alpha is 0 for such indices,
          * which would make filtering a no-op - confirm against alpha_table. */
6180     qp_thresh = 15 - h->slice_alpha_c0_offset;
6181     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6182        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6183         return;
6184
6185     if( IS_INTRA(mb_type) ) {
             /* Intra macroblock: fixed strengths. The left edge uses 4, the
              * top edge uses 3 in field pictures and 4 otherwise, and all
              * inner edges use 3. */
6186         int16_t bS4[4] = {4,4,4,4};
6187         int16_t bS3[4] = {3,3,3,3};
6188         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6189         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered */
6190             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6191             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6192             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6193             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6194         } else {
6195             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6196             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6197             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6198             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6199             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6200             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6201             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6202             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6203         }
             /* chroma: two edges per direction and plane, at offsets 0 and 4 */
6204         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6205         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6206         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6207         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6208         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6209         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6210         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6211         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6212         return;
6213     } else {
             /* bS is indexed [dir][edge][4 strengths]. bSv views the four
              * int16_t strengths of one edge as a single 64-bit value so that
              * an edge can be set or tested for "all zero" in one go. */
6214         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6215         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6216         int edges;
6217         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* edges 0 and 2 of both directions get strength 2 directly;
                  * edges 1 and 3 stay unset and are not used further down
                  * because the 8x8DCT case only filters edges 0 and 2 */
6218             edges = 4;
6219             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6220         } else {
                 /* NOTE(review): the strengths are presumably written into bS
                  * by the DSP function from the nnz/ref/mv caches; the meaning
                  * of the mask and step arguments is defined by that function
                  * and is not visible here. */
6221             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6222                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6223             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6224                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6225                              ? 3 : 0;
6226             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6227             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6228             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6229                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6230         }
             /* an intra neighbour overrides the outer edge: 4 on the left;
              * on top 3 in field pictures and 4 otherwise */
6231         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6232             bSv[0][0] = 0x0004000400040004ULL;
6233         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6234             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6235
             /* FILTER(hv,dir,edge): if any strength of the edge is nonzero,
              * filter the luma edge and, for even edges, the matching Cb and
              * Cr edge. Edge 0 uses the averaged QP (qp##dir expands to
              * qp0 or qp1), inner edges use the macroblock's own QP. */
6236 #define FILTER(hv,dir,edge)\
6237         if(bSv[dir][edge]) {\
6238             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6239             if(!(edge&1)) {\
6240                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6241                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6242             }\
6243         }
6244         if( edges == 1 ) {
6245             FILTER(v,0,0);
6246             FILTER(h,1,0);
6247         } else if( IS_8x8DCT(mb_type) ) {
6248             FILTER(v,0,0);
6249             FILTER(v,0,2);
6250             FILTER(h,1,0);
6251             FILTER(h,1,2);
6252         } else {
6253             FILTER(v,0,0);
6254             FILTER(v,0,1);
6255             FILTER(v,0,2);
6256             FILTER(v,0,3);
6257             FILTER(h,1,0);
6258             FILTER(h,1,1);
6259             FILTER(h,1,2);
6260             FILTER(h,1,3);
6261         }
6262 #undef FILTER
6263     }
6264 }
6265
6266
6267 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6268     MpegEncContext * const s = &h->s;
6269     int edge;
6270     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6271     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6272     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6273     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6274     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6275
6276     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6277                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6278     // how often to recheck mv-based bS when iterating between edges
6279     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6280                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6281     // how often to recheck mv-based bS when iterating along each edge
6282     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6283
6284     if (first_vertical_edge_done) {
6285         start = 1;
6286     }
6287
6288     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6289         start = 1;
6290
6291     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6292         && !IS_INTERLACED(mb_type)
6293         && IS_INTERLACED(mbm_type)
6294         ) {
6295         // This is a special case in the norm where the filtering must
6296         // be done twice (one each of the field) even if we are in a
6297         // frame macroblock.
6298         //
6299         static const int nnz_idx[4] = {4,5,6,3};
6300         unsigned int tmp_linesize   = 2 *   linesize;
6301         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6302         int mbn_xy = mb_xy - 2 * s->mb_stride;
6303         int qp;
6304         int i, j;
6305         int16_t bS[4];
6306
6307         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6308             if( IS_INTRA(mb_type) ||
6309                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6310                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6311             } else {
6312                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6313                 for( i = 0; i < 4; i++ ) {
6314                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6315                         mbn_nnz[nnz_idx[i]] != 0 )
6316                         bS[i] = 2;
6317                     else
6318                         bS[i] = 1;
6319                 }
6320             }
6321             // Do not use s->qscale as luma quantizer because it has not the same
6322             // value in IPCM macroblocks.
6323             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6324             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6325             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6326             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6327             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6328                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6329             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6330                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6331         }
6332
6333         start = 1;
6334     }
6335
6336     /* Calculate bS */
6337     for( edge = start; edge < edges; edge++ ) {
6338         /* mbn_xy: neighbor macroblock */
6339         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6340         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6341         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6342         int16_t bS[4];
6343         int qp;
6344
6345         if( (edge&1) && IS_8x8DCT(mb_type) )
6346             continue;
6347
6348         if( IS_INTRA(mb_type) ||
6349             IS_INTRA(mbn_type) ) {
6350             int value;
6351             if (edge == 0) {
6352                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6353                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6354                 ) {
6355                     value = 4;
6356                 } else {
6357                     value = 3;
6358                 }
6359             } else {
6360                 value = 3;
6361             }
6362             bS[0] = bS[1] = bS[2] = bS[3] = value;
6363         } else {
6364             int i, l;
6365             int mv_done;
6366
6367             if( edge & mask_edge ) {
6368                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6369                 mv_done = 1;
6370             }
6371             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6372                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6373                 mv_done = 1;
6374             }
6375             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6376                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6377                 int bn_idx= b_idx - (dir ? 8:1);
6378                 int v = 0;
6379
6380                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6381                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6382                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6383                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6384                 }
6385
6386                 if(h->slice_type_nos == FF_B_TYPE && v){
6387                     v=0;
6388                     for( l = 0; !v && l < 2; l++ ) {
6389                         int ln= 1-l;
6390                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6391                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6392                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6393                     }
6394                 }
6395
6396                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6397                 mv_done = 1;
6398             }
6399             else
6400                 mv_done = 0;
6401
6402             for( i = 0; i < 4; i++ ) {
6403                 int x = dir == 0 ? edge : i;
6404                 int y = dir == 0 ? i    : edge;
6405                 int b_idx= 8 + 4 + x + 8*y;
6406                 int bn_idx= b_idx - (dir ? 8:1);
6407
6408                 if( h->non_zero_count_cache[b_idx] |
6409                     h->non_zero_count_cache[bn_idx] ) {
6410                     bS[i] = 2;
6411                 }
6412                 else if(!mv_done)
6413                 {
6414                     bS[i] = 0;
6415                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6416                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6417                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6418                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6419                             bS[i] = 1;
6420                             break;
6421                         }
6422                     }
6423
6424                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6425                         bS[i] = 0;
6426                         for( l = 0; l < 2; l++ ) {
6427                             int ln= 1-l;
6428                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6429                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6430                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6431                                 bS[i] = 1;
6432                                 break;
6433                             }
6434                         }
6435                     }
6436                 }
6437             }
6438
6439             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6440                 continue;
6441         }
6442
6443         /* Filter edge */
6444         // Do not use s->qscale as luma quantizer because it has not the same
6445         // value in IPCM macroblocks.
6446         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6447         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6448         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6449         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6450         if( dir == 0 ) {
6451             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6452             if( (edge&1) == 0 ) {
6453                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6454                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6455                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6456                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6457             }
6458         } else {
6459             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6460             if( (edge&1) == 0 ) {
6461                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6462                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6463                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6464                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6465             }
6466         }
6467     }
6468 }
6469
/**
 * Apply the deblocking filter to one macroblock.
 *
 * The function works in this order:
 *  - when the frame is not MBAFF, return without filtering if the QP of this
 *    macroblock (and its rounded average with the left and top QP, where those
 *    neighbours exist) does not exceed a conservative threshold;
 *  - for CAVLC streams with transform_8x8_mode set, overwrite the cached
 *    non-zero-count entries of 8x8-DCT macroblocks (top, left, current) with
 *    bits taken from the coded block pattern;
 *  - in MBAFF frames, when the left macroblock exists, has a different
 *    interlaced type and passes the slice check, filter the first vertical
 *    edge here with 8 bS values and 2 QP values per plane;
 *  - call filter_mb_dir() for dir 0 and then dir 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column, used to compute mb_xy
 * @param mb_y       macroblock row, used to compute mb_xy
 * @param img_y      luma pointer handed to the edge filters
 * @param img_cb     first chroma pointer handed to the edge filters
 * @param img_cr     second chroma pointer handed to the edge filters
 * @param linesize   stride passed along with img_y
 * @param uvlinesize stride passed along with img_cb / img_cr
 */
6470 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6471     MpegEncContext * const s = &h->s;
6472     const int mb_xy= mb_x + mb_y*s->mb_stride;
6473     const int mb_type = s->current_picture.mb_type[mb_xy];
         // vertical MV difference threshold forwarded to filter_mb_dir(): 2 for interlaced macroblocks, 4 otherwise
6474     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
         // set to 1 below when the MBAFF branch has already filtered the left edge
6475     int first_vertical_edge_done = 0;
6476     av_unused int dir;
6477
6478     //for sufficiently low qp, filtering wouldn't do anything
6479     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6480     if(!FRAME_MBAFF){
6481         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6482         int qp = s->current_picture.qscale_table[mb_xy];
             // the left/top checks are skipped on the first column/row, where no such neighbour is read
6483         if(qp <= qp_thresh
6484            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6485            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6486             return;
6487         }
6488     }
6489
6490     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6491     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6492         int top_type, left_type[2];
6493         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6494         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6495         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6496
             // top neighbour: cache entries 4..7 of row 0 take cbp bits 4 and 8
6497         if(IS_8x8DCT(top_type)){
6498             h->non_zero_count_cache[4+8*0]=
6499             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6500             h->non_zero_count_cache[6+8*0]=
6501             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6502         }
             // left neighbours: cache column 3, rows 1-2 from left_mb_xy[0], rows 3-4 from left_mb_xy[1]
6503         if(IS_8x8DCT(left_type[0])){
6504             h->non_zero_count_cache[3+8*1]=
6505             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6506         }
6507         if(IS_8x8DCT(left_type[1])){
6508             h->non_zero_count_cache[3+8*3]=
6509             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6510         }
6511
             // current macroblock: each group of four scan8 entries gets one bit of h->cbp
6512         if(IS_8x8DCT(mb_type)){
6513             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6514             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6515
6516             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6517             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6518
6519             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6520             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6521
6522             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6523             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6524         }
6525     }
6526
6527     if (FRAME_MBAFF
6528             // left mb is in picture
6529             && h->slice_table[mb_xy-1] != 0xFFFF
6530             // and current and left pair do not have the same interlaced type
6531             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6532             // and left mb is in the same slice if deblocking_filter == 2
6533             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6534         /* First vertical edge is different in MBAFF frames
6535          * There are 8 different bS to compute and 2 different Qp
6536          */
             // pair_xy addresses the even-row macroblock of the current pair; left_mb_xy[] holds both macroblocks of the pair to its left
6537         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6538         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6539         int16_t bS[8];
6540         int qp[2];
6541         int bqp[2];
6542         int rqp[2];
6543         int mb_qp, mbn0_qp, mbn1_qp;
6544         int i;
6545         first_vertical_edge_done = 1;
6546
             // bS: 4 if the current or the selected left macroblock is intra,
             // 2 if either side reports non-zero coefficients, 1 otherwise
6547         if( IS_INTRA(mb_type) )
6548             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6549         else {
6550             for( i = 0; i < 8; i++ ) {
6551                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6552
6553                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6554                     bS[i] = 4;
6555                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6556                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6557                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6558                                                                        :
6559                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6560                     bS[i] = 2;
6561                 else
6562                     bS[i] = 1;
6563             }
6564         }
6565
             // index 0 / 1 of qp, bqp, rqp: rounded average of the current QP with left_mb_xy[0] / left_mb_xy[1]
6566         mb_qp = s->current_picture.qscale_table[mb_xy];
6567         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6568         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6569         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6570         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6571                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6572         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6573                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6574         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6575         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6576                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6577         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6578                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6579
6580         /* Filter edge */
6581         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6582         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6583         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6584         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6585         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6586     }
6587
         // Both variants make the same two filter_mb_dir() calls: dir 0 receives
         // first_vertical_edge_done, dir 1 always receives 0.
6588 #if CONFIG_SMALL
6589     for( dir = 0; dir < 2; dir++ )
6590         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6591 #else
6592     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6593     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6594 #endif
6595 }
6596
/**
 * Decode the macroblocks of one slice.
 *
 * Macroblocks are decoded with decode_mb_cabac() when h->pps.cabac is set and
 * with decode_mb_cavlc() otherwise; each successfully decoded macroblock is
 * passed to hl_decode_mb(). In MBAFF frames the macroblock one row below is
 * decoded in the same iteration. The decoded range is reported through
 * ff_er_add_slice() before every return inside the loops.
 *
 * @param avctx codec context (not referenced in the body)
 * @param arg   pointer to an H264Context pointer; it is dereferenced once to
 *              obtain the context
 * @return 0 when the slice ends cleanly, -1 on a decoding error or when the
 *         CAVLC bit position does not match the buffer size at the end
 */
6597 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6598     H264Context *h = *(void**)arg;
6599     MpegEncContext * const s = &h->s;
         // mask applied to every status passed to ff_er_add_slice(): only the AC bits are kept for partitioned frames
6600     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6601
6602     s->mb_skip_run= -1;
6603
6604     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6605                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6606
6607     if( h->pps.cabac ) {
6608         int i;
6609
6610         /* realign */
6611         align_get_bits( &s->gb );
6612
6613         /* init cabac */
6614         ff_init_cabac_states( &h->cabac);
             // the CABAC decoder starts at the current byte position of the bit reader and covers the remaining bytes
6615         ff_init_cabac_decoder( &h->cabac,
6616                                s->gb.buffer + get_bits_count(&s->gb)/8,
6617                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6618         /* calculate pre-state */
             // table choice: the I table for I slices, otherwise the P/B table selected by h->cabac_init_idc
6619         for( i= 0; i < 460; i++ ) {
6620             int pre;
6621             if( h->slice_type_nos == FF_I_TYPE )
6622                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6623             else
6624                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6625
6626             if( pre <= 63 )
6627                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6628             else
6629                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6630         }
6631
6632         for(;;){
6633 //START_TIMER
6634             int ret = decode_mb_cabac(h);
6635             int eos;
6636 //STOP_TIMER("decode_mb_cabac")
6637
6638             if(ret>=0) hl_decode_mb(h);
6639
                 // MBAFF: decode the macroblock one row below as well, then restore mb_y
6640             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6641                 s->mb_y++;
6642
6643                 ret = decode_mb_cabac(h);
6644
6645                 if(ret>=0) hl_decode_mb(h);
6646                 s->mb_y--;
6647             }
6648             eos = get_cabac_terminate( &h->cabac );
6649
                 // error if the macroblock failed or the CABAC read pointer is more than 2 bytes past the end of its buffer
6650             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6651                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6652                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6653                 return -1;
6654             }
6655
                 // end of row: report the finished band and advance mb_y (by two rows for field or MBAFF pictures)
6656             if( ++s->mb_x >= s->mb_width ) {
6657                 s->mb_x = 0;
6658                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6659                 ++s->mb_y;
6660                 if(FIELD_OR_MBAFF_PICTURE) {
6661                     ++s->mb_y;
6662                 }
6663             }
6664
6665             if( eos || s->mb_y >= s->mb_height ) {
6666                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6667                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6668                 return 0;
6669             }
6670         }
6671
6672     } else {
6673         for(;;){
6674             int ret = decode_mb_cavlc(h);
6675
6676             if(ret>=0) hl_decode_mb(h);
6677
                 // MBAFF: decode the macroblock one row below as well, then restore mb_y
6678             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6679                 s->mb_y++;
6680                 ret = decode_mb_cavlc(h);
6681
6682                 if(ret>=0) hl_decode_mb(h);
6683                 s->mb_y--;
6684             }
6685
6686             if(ret<0){
6687                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6688                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6689
6690                 return -1;
6691             }
6692
6693             if(++s->mb_x >= s->mb_width){
6694                 s->mb_x=0;
6695                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6696                 ++s->mb_y;
6697                 if(FIELD_OR_MBAFF_PICTURE) {
6698                     ++s->mb_y;
6699                 }
                     // past the last row: success only if the bit position equals the buffer size exactly
6700                 if(s->mb_y >= s->mb_height){
6701                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6702
6703                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6704                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6705
6706                         return 0;
6707                     }else{
                             // NOTE(review): this path returns -1 but reports *_END flags, unlike the other error paths - confirm this is intended
6708                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6709
6710                         return -1;
6711                     }
6712                 }
6713             }
6714
                 // all bits consumed with no skip run pending: clean end on an exact match, error on an overread
6715             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6716                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6717                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6718                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6719
6720                     return 0;
6721                 }else{
6722                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6723
6724                     return -1;
6725                 }
6726             }
6727         }
6728     }
6729
     // The region below is excluded from compilation by "#if 0"; both loops above only leave through a return.
6730 #if 0
6731     for(;s->mb_y < s->mb_height; s->mb_y++){
6732         for(;s->mb_x < s->mb_width; s->mb_x++){
6733             int ret= decode_mb(h);
6734
6735             hl_decode_mb(h);
6736
6737             if(ret<0){
6738                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6739                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6740
6741                 return -1;
6742             }
6743
6744             if(++s->mb_x >= s->mb_width){
6745                 s->mb_x=0;
6746                 if(++s->mb_y >= s->mb_height){
6747                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6748                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6749
6750                         return 0;
6751                     }else{
6752                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6753
6754                         return -1;
6755                     }
6756                 }
6757             }
6758
6759             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6760                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6761                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6762
6763                     return 0;
6764                 }else{
6765                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6766
6767                     return -1;
6768                 }
6769             }
6770         }
6771         s->mb_x=0;
6772         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6773     }
6774 #endif
6775     return -1; //not reached
6776 }
6777
6778 static int decode_picture_timing(H264Context *h){
6779     MpegEncContext * const s = &h->s;
6780     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6781         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6782         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6783     }
6784     if(h->sps.pic_struct_present_flag){
6785         unsigned int i, num_clock_ts;
6786         h->sei_pic_struct = get_bits(&s->gb, 4);
6787
6788         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6789             return -1;
6790
6791         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6792
6793         for (i = 0 ; i < num_clock_ts ; i++){
6794             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6795                 unsigned int full_timestamp_flag;
6796                 skip_bits(&s->gb, 2);                 /* ct_type */
6797                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6798                 skip_bits(&s->gb, 5);                 /* counting_type */
6799                 full_timestamp_flag = get_bits(&s->gb, 1);
6800                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6801                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6802                 skip_bits(&s->gb, 8);                 /* n_frames */
6803                 if(full_timestamp_flag){
6804                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6805                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6806                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6807                 }else{
6808                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6809                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6810                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6811                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6812                             if(get_bits(&s->gb, 1))   /* hours_flag */
6813                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6814                         }
6815                     }
6816                 }
6817                 if(h->sps.time_offset_length > 0)
6818                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6819             }
6820         }
6821     }
6822     return 0;
6823 }
6824
6825 static int decode_unregistered_user_data(H264Context *h, int size){
6826     MpegEncContext * const s = &h->s;
6827     uint8_t user_data[16+256];
6828     int e, build, i;
6829
6830     if(size<16)
6831         return -1;
6832
6833     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6834         user_data[i]= get_bits(&s->gb, 8);
6835     }
6836
6837     user_data[i]= 0;
6838     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6839     if(e==1 && build>=0)
6840         h->x264_build= build;
6841
6842     if(s->avctx->debug & FF_DEBUG_BUGS)
6843         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6844
6845     for(; i<size; i++)
6846         skip_bits(&s->gb, 8);
6847
6848     return 0;
6849 }
6850
6851 static int decode_recovery_point(H264Context *h){
6852     MpegEncContext * const s = &h->s;
6853
6854     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6855     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6856
6857     return 0;
6858 }
6859
6860 static int decode_buffering_period(H264Context *h){
6861     MpegEncContext * const s = &h->s;
6862     unsigned int sps_id;
6863     int sched_sel_idx;
6864     SPS *sps;
6865
6866     sps_id = get_ue_golomb_31(&s->gb);
6867     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6868         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6869         return -1;
6870     }
6871     sps = h->sps_buffers[sps_id];
6872
6873     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6874     if (sps->nal_hrd_parameters_present_flag) {
6875         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6876             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6877             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6878         }
6879     }
6880     if (sps->vcl_hrd_parameters_present_flag) {
6881         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6882             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6883             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6884         }
6885     }
6886
6887     h->sei_buffering_period_present = 1;
6888     return 0;
6889 }
6890
6891 int ff_h264_decode_sei(H264Context *h){
6892     MpegEncContext * const s = &h->s;
6893
6894     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6895         int size, type;
6896
6897         type=0;
6898         do{
6899             type+= show_bits(&s->gb, 8);
6900         }while(get_bits(&s->gb, 8) == 255);
6901
6902         size=0;
6903         do{
6904             size+= show_bits(&s->gb, 8);
6905         }while(get_bits(&s->gb, 8) == 255);
6906
6907         switch(type){
6908         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6909             if(decode_picture_timing(h) < 0)
6910                 return -1;
6911             break;
6912         case SEI_TYPE_USER_DATA_UNREGISTERED:
6913             if(decode_unregistered_user_data(h, size) < 0)
6914                 return -1;
6915             break;
6916         case SEI_TYPE_RECOVERY_POINT:
6917             if(decode_recovery_point(h) < 0)
6918                 return -1;
6919             break;
6920         case SEI_BUFFERING_PERIOD:
6921             if(decode_buffering_period(h) < 0)
6922                 return -1;
6923             break;
6924         default:
6925             skip_bits(&s->gb, 8*size);
6926         }
6927
6928         //FIXME check bits here
6929         align_get_bits(&s->gb);
6930     }
6931
6932     return 0;
6933 }
6934
6935 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6936     MpegEncContext * const s = &h->s;
6937     int cpb_count, i;
6938     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6939
6940     if(cpb_count > 32U){
6941         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6942         return -1;
6943     }
6944
6945     get_bits(&s->gb, 4); /* bit_rate_scale */
6946     get_bits(&s->gb, 4); /* cpb_size_scale */
6947     for(i=0; i<cpb_count; i++){
6948         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6949         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6950         get_bits1(&s->gb);     /* cbr_flag */
6951     }
6952     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6953     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6954     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6955     sps->time_offset_length = get_bits(&s->gb, 5);
6956     sps->cpb_cnt = cpb_count;
6957     return 0;
6958 }
6959
6960 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6961     MpegEncContext * const s = &h->s;
6962     int aspect_ratio_info_present_flag;
6963     unsigned int aspect_ratio_idc;
6964
6965     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6966
6967     if( aspect_ratio_info_present_flag ) {
6968         aspect_ratio_idc= get_bits(&s->gb, 8);
6969         if( aspect_ratio_idc == EXTENDED_SAR ) {
6970             sps->sar.num= get_bits(&s->gb, 16);
6971             sps->sar.den= get_bits(&s->gb, 16);
6972         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6973             sps->sar=  pixel_aspect[aspect_ratio_idc];
6974         }else{
6975             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6976             return -1;
6977         }
6978     }else{
6979         sps->sar.num=
6980         sps->sar.den= 0;
6981     }
6982 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6983
6984     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6985         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6986     }
6987
6988     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6989         get_bits(&s->gb, 3);    /* video_format */
6990         get_bits1(&s->gb);      /* video_full_range_flag */
6991         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6992             get_bits(&s->gb, 8); /* colour_primaries */
6993             get_bits(&s->gb, 8); /* transfer_characteristics */
6994             get_bits(&s->gb, 8); /* matrix_coefficients */
6995         }
6996     }
6997
6998     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6999         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7000         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7001     }
7002
7003     sps->timing_info_present_flag = get_bits1(&s->gb);
7004     if(sps->timing_info_present_flag){
7005         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7006         sps->time_scale = get_bits_long(&s->gb, 32);
7007         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7008     }
7009
7010     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7011     if(sps->nal_hrd_parameters_present_flag)
7012         if(decode_hrd_parameters(h, sps) < 0)
7013             return -1;
7014     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7015     if(sps->vcl_hrd_parameters_present_flag)
7016         if(decode_hrd_parameters(h, sps) < 0)
7017             return -1;
7018     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7019         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7020     sps->pic_struct_present_flag = get_bits1(&s->gb);
7021
7022     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7023     if(sps->bitstream_restriction_flag){
7024         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7025         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7026         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7027         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7028         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7029         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7030         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7031
7032         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7033             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7034             return -1;
7035         }
7036     }
7037
7038     return 0;
7039 }
7040
7041 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7042                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7043     MpegEncContext * const s = &h->s;
7044     int i, last = 8, next = 8;
7045     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7046     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7047         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7048     else
7049     for(i=0;i<size;i++){
7050         if(next)
7051             next = (last + get_se_golomb(&s->gb)) & 0xff;
7052         if(!i && !next){ /* matrix not written, we use the preset one */
7053             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7054             break;
7055         }
7056         last = factors[scan[i]] = next ? next : last;
7057     }
7058 }
7059
7060 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7061                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7062     MpegEncContext * const s = &h->s;
7063     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7064     const uint8_t *fallback[4] = {
7065         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7066         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7067         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7068         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7069     };
7070     if(get_bits1(&s->gb)){
7071         sps->scaling_matrix_present |= is_sps;
7072         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7073         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7074         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7075         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7076         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7077         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7078         if(is_sps || pps->transform_8x8_mode){
7079             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7080             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7081         }
7082     }
7083 }
7084
7085 int ff_h264_decode_seq_parameter_set(H264Context *h){
7086     MpegEncContext * const s = &h->s;
7087     int profile_idc, level_idc;
7088     unsigned int sps_id;
7089     int i;
7090     SPS *sps;
7091
7092     profile_idc= get_bits(&s->gb, 8);
7093     get_bits1(&s->gb);   //constraint_set0_flag
7094     get_bits1(&s->gb);   //constraint_set1_flag
7095     get_bits1(&s->gb);   //constraint_set2_flag
7096     get_bits1(&s->gb);   //constraint_set3_flag
7097     get_bits(&s->gb, 4); // reserved
7098     level_idc= get_bits(&s->gb, 8);
7099     sps_id= get_ue_golomb_31(&s->gb);
7100
7101     if(sps_id >= MAX_SPS_COUNT) {
7102         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7103         return -1;
7104     }
7105     sps= av_mallocz(sizeof(SPS));
7106     if(sps == NULL)
7107         return -1;
7108
7109     sps->profile_idc= profile_idc;
7110     sps->level_idc= level_idc;
7111
7112     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7113     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7114     sps->scaling_matrix_present = 0;
7115
7116     if(sps->profile_idc >= 100){ //high profile
7117         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7118         if(sps->chroma_format_idc == 3)
7119             sps->residual_color_transform_flag = get_bits1(&s->gb);
7120         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7121         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7122         sps->transform_bypass = get_bits1(&s->gb);
7123         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7124     }else{
7125         sps->chroma_format_idc= 1;
7126     }
7127
7128     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7129     sps->poc_type= get_ue_golomb_31(&s->gb);
7130
7131     if(sps->poc_type == 0){ //FIXME #define
7132         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7133     } else if(sps->poc_type == 1){//FIXME #define
7134         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7135         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7136         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7137         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7138
7139         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7140             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7141             goto fail;
7142         }
7143
7144         for(i=0; i<sps->poc_cycle_length; i++)
7145             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7146     }else if(sps->poc_type != 2){
7147         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7148         goto fail;
7149     }
7150
7151     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7152     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7153         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7154         goto fail;
7155     }
7156     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7157     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7158     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7159     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7160        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7161         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7162         goto fail;
7163     }
7164
7165     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7166     if(!sps->frame_mbs_only_flag)
7167         sps->mb_aff= get_bits1(&s->gb);
7168     else
7169         sps->mb_aff= 0;
7170
7171     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7172
7173 #ifndef ALLOW_INTERLACE
7174     if(sps->mb_aff)
7175         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7176 #endif
7177     sps->crop= get_bits1(&s->gb);
7178     if(sps->crop){
7179         sps->crop_left  = get_ue_golomb(&s->gb);
7180         sps->crop_right = get_ue_golomb(&s->gb);
7181         sps->crop_top   = get_ue_golomb(&s->gb);
7182         sps->crop_bottom= get_ue_golomb(&s->gb);
7183         if(sps->crop_left || sps->crop_top){
7184             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7185         }
7186         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7187             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7188         }
7189     }else{
7190         sps->crop_left  =
7191         sps->crop_right =
7192         sps->crop_top   =
7193         sps->crop_bottom= 0;
7194     }
7195
7196     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7197     if( sps->vui_parameters_present_flag )
7198         decode_vui_parameters(h, sps);
7199
7200     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7201         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7202                sps_id, sps->profile_idc, sps->level_idc,
7203                sps->poc_type,
7204                sps->ref_frame_count,
7205                sps->mb_width, sps->mb_height,
7206                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7207                sps->direct_8x8_inference_flag ? "8B8" : "",
7208                sps->crop_left, sps->crop_right,
7209                sps->crop_top, sps->crop_bottom,
7210                sps->vui_parameters_present_flag ? "VUI" : "",
7211                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7212                );
7213     }
7214
7215     av_free(h->sps_buffers[sps_id]);
7216     h->sps_buffers[sps_id]= sps;
7217     h->sps = *sps;
7218     return 0;
7219 fail:
7220     av_free(sps);
7221     return -1;
7222 }
7223
7224 static void
7225 build_qp_table(PPS *pps, int t, int index)
7226 {
7227     int i;
7228     for(i = 0; i < 52; i++)
7229         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7230 }
7231
7232 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7233     MpegEncContext * const s = &h->s;
7234     unsigned int pps_id= get_ue_golomb(&s->gb);
7235     PPS *pps;
7236
7237     if(pps_id >= MAX_PPS_COUNT) {
7238         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7239         return -1;
7240     }
7241
7242     pps= av_mallocz(sizeof(PPS));
7243     if(pps == NULL)
7244         return -1;
7245     pps->sps_id= get_ue_golomb_31(&s->gb);
7246     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7247         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7248         goto fail;
7249     }
7250
7251     pps->cabac= get_bits1(&s->gb);
7252     pps->pic_order_present= get_bits1(&s->gb);
7253     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7254     if(pps->slice_group_count > 1 ){
7255         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7256         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7257         switch(pps->mb_slice_group_map_type){
7258         case 0:
7259 #if 0
7260 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7261 |    run_length[ i ]                                |1  |ue(v)   |
7262 #endif
7263             break;
7264         case 2:
7265 #if 0
7266 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7267 |{                                                  |   |        |
7268 |    top_left_mb[ i ]                               |1  |ue(v)   |
7269 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7270 |   }                                               |   |        |
7271 #endif
7272             break;
7273         case 3:
7274         case 4:
7275         case 5:
7276 #if 0
7277 |   slice_group_change_direction_flag               |1  |u(1)    |
7278 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7279 #endif
7280             break;
7281         case 6:
7282 #if 0
7283 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7284 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7285 |)                                                  |   |        |
7286 |    slice_group_id[ i ]                            |1  |u(v)    |
7287 #endif
7288             break;
7289         }
7290     }
7291     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7292     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7293     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7294         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7295         goto fail;
7296     }
7297
7298     pps->weighted_pred= get_bits1(&s->gb);
7299     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7300     pps->init_qp= get_se_golomb(&s->gb) + 26;
7301     pps->init_qs= get_se_golomb(&s->gb) + 26;
7302     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7303     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7304     pps->constrained_intra_pred= get_bits1(&s->gb);
7305     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7306
7307     pps->transform_8x8_mode= 0;
7308     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7309     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7310     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7311
7312     if(get_bits_count(&s->gb) < bit_length){
7313         pps->transform_8x8_mode= get_bits1(&s->gb);
7314         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7315         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7316     } else {
7317         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7318     }
7319
7320     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7321     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7322     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7323         h->pps.chroma_qp_diff= 1;
7324
7325     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7326         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7327                pps_id, pps->sps_id,
7328                pps->cabac ? "CABAC" : "CAVLC",
7329                pps->slice_group_count,
7330                pps->ref_count[0], pps->ref_count[1],
7331                pps->weighted_pred ? "weighted" : "",
7332                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7333                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7334                pps->constrained_intra_pred ? "CONSTR" : "",
7335                pps->redundant_pic_cnt_present ? "REDU" : "",
7336                pps->transform_8x8_mode ? "8x8DCT" : ""
7337                );
7338     }
7339
7340     av_free(h->pps_buffers[pps_id]);
7341     h->pps_buffers[pps_id]= pps;
7342     return 0;
7343 fail:
7344     av_free(pps);
7345     return -1;
7346 }
7347
7348 /**
7349  * Call decode_slice() for each context.
7350  *
7351  * @param h h264 master context
7352  * @param context_count number of contexts to execute
7353  */
7354 static void execute_decode_slices(H264Context *h, int context_count){
7355     MpegEncContext * const s = &h->s;
7356     AVCodecContext * const avctx= s->avctx;
7357     H264Context *hx;
7358     int i;
7359
7360     if (s->avctx->hwaccel)
7361         return;
7362     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7363         return;
7364     if(context_count == 1) {
7365         decode_slice(avctx, &h);
7366     } else {
7367         for(i = 1; i < context_count; i++) {
7368             hx = h->thread_context[i];
7369             hx->s.error_recognition = avctx->error_recognition;
7370             hx->s.error_count = 0;
7371         }
7372
7373         avctx->execute(avctx, (void *)decode_slice,
7374                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7375
7376         /* pull back stuff from slices to master context */
7377         hx = h->thread_context[context_count - 1];
7378         s->mb_x = hx->s.mb_x;
7379         s->mb_y = hx->s.mb_y;
7380         s->dropable = hx->s.dropable;
7381         s->picture_structure = hx->s.picture_structure;
7382         for(i = 1; i < context_count; i++)
7383             h->s.error_count += h->thread_context[i]->s.error_count;
7384     }
7385 }
7386
7387
7388 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7389     MpegEncContext * const s = &h->s;
7390     AVCodecContext * const avctx= s->avctx;
7391     int buf_index=0;
7392     H264Context *hx; ///< thread context
7393     int context_count = 0;
7394
7395     h->max_contexts = avctx->thread_count;
7396 #if 0
7397     int i;
7398     for(i=0; i<50; i++){
7399         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7400     }
7401 #endif
7402     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7403         h->current_slice = 0;
7404         if (!s->first_field)
7405             s->current_picture_ptr= NULL;
7406     }
7407
7408     for(;;){
7409         int consumed;
7410         int dst_length;
7411         int bit_length;
7412         const uint8_t *ptr;
7413         int i, nalsize = 0;
7414         int err;
7415
7416         if(h->is_avc) {
7417             if(buf_index >= buf_size) break;
7418             nalsize = 0;
7419             for(i = 0; i < h->nal_length_size; i++)
7420                 nalsize = (nalsize << 8) | buf[buf_index++];
7421             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7422                 if(nalsize == 1){
7423                     buf_index++;
7424                     continue;
7425                 }else{
7426                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7427                     break;
7428                 }
7429             }
7430         } else {
7431             // start code prefix search
7432             for(; buf_index + 3 < buf_size; buf_index++){
7433                 // This should always succeed in the first iteration.
7434                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7435                     break;
7436             }
7437
7438             if(buf_index+3 >= buf_size) break;
7439
7440             buf_index+=3;
7441         }
7442
7443         hx = h->thread_context[context_count];
7444
7445         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7446         if (ptr==NULL || dst_length < 0){
7447             return -1;
7448         }
7449         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7450             dst_length--;
7451         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7452
7453         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7454             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7455         }
7456
7457         if (h->is_avc && (nalsize != consumed)){
7458             int i, debug_level = AV_LOG_DEBUG;
7459             for (i = consumed; i < nalsize; i++)
7460                 if (buf[buf_index+i])
7461                     debug_level = AV_LOG_ERROR;
7462             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7463             consumed= nalsize;
7464         }
7465
7466         buf_index += consumed;
7467
7468         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7469            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7470             continue;
7471
7472       again:
7473         err = 0;
7474         switch(hx->nal_unit_type){
7475         case NAL_IDR_SLICE:
7476             if (h->nal_unit_type != NAL_IDR_SLICE) {
7477                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7478                 return -1;
7479             }
7480             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7481         case NAL_SLICE:
7482             init_get_bits(&hx->s.gb, ptr, bit_length);
7483             hx->intra_gb_ptr=
7484             hx->inter_gb_ptr= &hx->s.gb;
7485             hx->s.data_partitioning = 0;
7486
7487             if((err = decode_slice_header(hx, h)))
7488                break;
7489
7490             s->current_picture_ptr->key_frame |=
7491                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7492                     (h->sei_recovery_frame_cnt >= 0);
7493             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7494                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7495                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7496                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7497                && avctx->skip_frame < AVDISCARD_ALL){
7498                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7499                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7500                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7501                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7502                 }else
7503                     context_count++;
7504             }
7505             break;
7506         case NAL_DPA:
7507             init_get_bits(&hx->s.gb, ptr, bit_length);
7508             hx->intra_gb_ptr=
7509             hx->inter_gb_ptr= NULL;
7510             hx->s.data_partitioning = 1;
7511
7512             err = decode_slice_header(hx, h);
7513             break;
7514         case NAL_DPB:
7515             init_get_bits(&hx->intra_gb, ptr, bit_length);
7516             hx->intra_gb_ptr= &hx->intra_gb;
7517             break;
7518         case NAL_DPC:
7519             init_get_bits(&hx->inter_gb, ptr, bit_length);
7520             hx->inter_gb_ptr= &hx->inter_gb;
7521
7522             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7523                && s->context_initialized
7524                && s->hurry_up < 5
7525                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7526                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7527                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7528                && avctx->skip_frame < AVDISCARD_ALL)
7529                 context_count++;
7530             break;
7531         case NAL_SEI:
7532             init_get_bits(&s->gb, ptr, bit_length);
7533             ff_h264_decode_sei(h);
7534             break;
7535         case NAL_SPS:
7536             init_get_bits(&s->gb, ptr, bit_length);
7537             ff_h264_decode_seq_parameter_set(h);
7538
7539             if(s->flags& CODEC_FLAG_LOW_DELAY)
7540                 s->low_delay=1;
7541
7542             if(avctx->has_b_frames < 2)
7543                 avctx->has_b_frames= !s->low_delay;
7544             break;
7545         case NAL_PPS:
7546             init_get_bits(&s->gb, ptr, bit_length);
7547
7548             ff_h264_decode_picture_parameter_set(h, bit_length);
7549
7550             break;
7551         case NAL_AUD:
7552         case NAL_END_SEQUENCE:
7553         case NAL_END_STREAM:
7554         case NAL_FILLER_DATA:
7555         case NAL_SPS_EXT:
7556         case NAL_AUXILIARY_SLICE:
7557             break;
7558         default:
7559             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7560         }
7561
7562         if(context_count == h->max_contexts) {
7563             execute_decode_slices(h, context_count);
7564             context_count = 0;
7565         }
7566
7567         if (err < 0)
7568             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7569         else if(err == 1) {
7570             /* Slice could not be decoded in parallel mode, copy down
7571              * NAL unit stuff to context 0 and restart. Note that
7572              * rbsp_buffer is not transferred, but since we no longer
7573              * run in parallel mode this should not be an issue. */
7574             h->nal_unit_type = hx->nal_unit_type;
7575             h->nal_ref_idc   = hx->nal_ref_idc;
7576             hx = h;
7577             goto again;
7578         }
7579     }
7580     if(context_count)
7581         execute_decode_slices(h, context_count);
7582     return buf_index;
7583 }
7584
7585 /**
7586  * returns the number of bytes consumed for building the current frame
7587  */
7588 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7589         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7590         if(pos+10>buf_size) pos=buf_size; // oops ;)
7591
7592         return pos;
7593 }
7594
7595 static int decode_frame(AVCodecContext *avctx,
7596                              void *data, int *data_size,
7597                              const uint8_t *buf, int buf_size)
7598 {
7599     H264Context *h = avctx->priv_data;
7600     MpegEncContext *s = &h->s;
7601     AVFrame *pict = data;
7602     int buf_index;
7603
7604     s->flags= avctx->flags;
7605     s->flags2= avctx->flags2;
7606
7607    /* end of stream, output what is still in the buffers */
7608     if (buf_size == 0) {
7609         Picture *out;
7610         int i, out_idx;
7611
7612 //FIXME factorize this with the output code below
7613         out = h->delayed_pic[0];
7614         out_idx = 0;
7615         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7616             if(h->delayed_pic[i]->poc < out->poc){
7617                 out = h->delayed_pic[i];
7618                 out_idx = i;
7619             }
7620
7621         for(i=out_idx; h->delayed_pic[i]; i++)
7622             h->delayed_pic[i] = h->delayed_pic[i+1];
7623
7624         if(out){
7625             *data_size = sizeof(AVFrame);
7626             *pict= *(AVFrame*)out;
7627         }
7628
7629         return 0;
7630     }
7631
7632     if(h->is_avc && !h->got_avcC) {
7633         int i, cnt, nalsize;
7634         unsigned char *p = avctx->extradata;
7635         if(avctx->extradata_size < 7) {
7636             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7637             return -1;
7638         }
7639         if(*p != 1) {
7640             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7641             return -1;
7642         }
7643         /* sps and pps in the avcC always have length coded with 2 bytes,
7644            so put a fake nal_length_size = 2 while parsing them */
7645         h->nal_length_size = 2;
7646         // Decode sps from avcC
7647         cnt = *(p+5) & 0x1f; // Number of sps
7648         p += 6;
7649         for (i = 0; i < cnt; i++) {
7650             nalsize = AV_RB16(p) + 2;
7651             if(decode_nal_units(h, p, nalsize) < 0) {
7652                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7653                 return -1;
7654             }
7655             p += nalsize;
7656         }
7657         // Decode pps from avcC
7658         cnt = *(p++); // Number of pps
7659         for (i = 0; i < cnt; i++) {
7660             nalsize = AV_RB16(p) + 2;
7661             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7662                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7663                 return -1;
7664             }
7665             p += nalsize;
7666         }
7667         // Now store right nal length size, that will be use to parse all other nals
7668         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7669         // Do not reparse avcC
7670         h->got_avcC = 1;
7671     }
7672
7673     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7674         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7675             return -1;
7676         h->got_avcC = 1;
7677     }
7678
7679     buf_index=decode_nal_units(h, buf, buf_size);
7680     if(buf_index < 0)
7681         return -1;
7682
7683     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7684         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7685         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7686         return -1;
7687     }
7688
7689     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7690         Picture *out = s->current_picture_ptr;
7691         Picture *cur = s->current_picture_ptr;
7692         int i, pics, cross_idr, out_of_order, out_idx;
7693
7694         s->mb_y= 0;
7695
7696         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7697         s->current_picture_ptr->pict_type= s->pict_type;
7698
7699         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7700             ff_vdpau_h264_set_reference_frames(s);
7701
7702         if(!s->dropable) {
7703             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7704             h->prev_poc_msb= h->poc_msb;
7705             h->prev_poc_lsb= h->poc_lsb;
7706         }
7707         h->prev_frame_num_offset= h->frame_num_offset;
7708         h->prev_frame_num= h->frame_num;
7709
7710         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7711             ff_vdpau_h264_picture_complete(s);
7712
7713         /*
7714          * FIXME: Error handling code does not seem to support interlaced
7715          * when slices span multiple rows
7716          * The ff_er_add_slice calls don't work right for bottom
7717          * fields; they cause massive erroneous error concealing
7718          * Error marking covers both fields (top and bottom).
7719          * This causes a mismatched s->error_count
7720          * and a bad error table. Further, the error count goes to
7721          * INT_MAX when called for bottom field, because mb_y is
7722          * past end by one (callers fault) and resync_mb_y != 0
7723          * causes problems for the first MB line, too.
7724          */
7725         if (!FIELD_PICTURE)
7726             ff_er_frame_end(s);
7727
7728         MPV_frame_end(s);
7729         h->sei_recovery_frame_cnt = -1;
7730         h->sei_dpb_output_delay = 0;
7731         h->sei_cpb_removal_delay = -1;
7732         h->sei_buffering_period_present = 0;
7733
7734         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7735             /* Wait for second field. */
7736             *data_size = 0;
7737
7738         } else {
7739             cur->repeat_pict = 0;
7740
7741             /* Signal interlacing information externally. */
7742             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7743             if(h->sps.pic_struct_present_flag){
7744                 switch (h->sei_pic_struct)
7745                 {
7746                 case SEI_PIC_STRUCT_FRAME:
7747                     cur->interlaced_frame = 0;
7748                     break;
7749                 case SEI_PIC_STRUCT_TOP_FIELD:
7750                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7751                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7752                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7753                     cur->interlaced_frame = 1;
7754                     break;
7755                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7756                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7757                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7758                     // From these hints, let the applications decide if they apply deinterlacing.
7759                     cur->repeat_pict = 1;
7760                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7761                     break;
7762                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7763                     // Force progressive here, as doubling interlaced frame is a bad idea.
7764                     cur->interlaced_frame = 0;
7765                     cur->repeat_pict = 2;
7766                     break;
7767                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7768                     cur->interlaced_frame = 0;
7769                     cur->repeat_pict = 4;
7770                     break;
7771                 }
7772             }else{
7773                 /* Derive interlacing flag from used decoding process. */
7774                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7775             }
7776
7777             if (cur->field_poc[0] != cur->field_poc[1]){
7778                 /* Derive top_field_first from field pocs. */
7779                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7780             }else{
7781                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7782                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7783                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7784                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7785                         cur->top_field_first = 1;
7786                     else
7787                         cur->top_field_first = 0;
7788                 }else{
7789                     /* Most likely progressive */
7790                     cur->top_field_first = 0;
7791                 }
7792             }
7793
7794         //FIXME do something with unavailable reference frames
7795
7796             /* Sort B-frames into display order */
7797
7798             if(h->sps.bitstream_restriction_flag
7799                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7800                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7801                 s->low_delay = 0;
7802             }
7803
7804             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7805                && !h->sps.bitstream_restriction_flag){
7806                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7807                 s->low_delay= 0;
7808             }
7809
7810             pics = 0;
7811             while(h->delayed_pic[pics]) pics++;
7812
7813             assert(pics <= MAX_DELAYED_PIC_COUNT);
7814
7815             h->delayed_pic[pics++] = cur;
7816             if(cur->reference == 0)
7817                 cur->reference = DELAYED_PIC_REF;
7818
7819             out = h->delayed_pic[0];
7820             out_idx = 0;
7821             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7822                 if(h->delayed_pic[i]->poc < out->poc){
7823                     out = h->delayed_pic[i];
7824                     out_idx = i;
7825                 }
7826             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7827
7828             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7829
7830             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7831                 { }
7832             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7833                || (s->low_delay &&
7834                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7835                  || cur->pict_type == FF_B_TYPE)))
7836             {
7837                 s->low_delay = 0;
7838                 s->avctx->has_b_frames++;
7839             }
7840
7841             if(out_of_order || pics > s->avctx->has_b_frames){
7842                 out->reference &= ~DELAYED_PIC_REF;
7843                 for(i=out_idx; h->delayed_pic[i]; i++)
7844                     h->delayed_pic[i] = h->delayed_pic[i+1];
7845             }
7846             if(!out_of_order && pics > s->avctx->has_b_frames){
7847                 *data_size = sizeof(AVFrame);
7848
7849                 h->outputed_poc = out->poc;
7850                 *pict= *(AVFrame*)out;
7851             }else{
7852                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7853             }
7854         }
7855     }
7856
7857     assert(pict->data[0] || !*data_size);
7858     ff_print_debug_info(s, pict);
7859 //printf("out %d\n", (int)pict->data[0]);
7860 #if 0 //?
7861
7862     /* Return the Picture timestamp as the frame number */
7863     /* we subtract 1 because it is added on utils.c     */
7864     avctx->frame_number = s->picture_number - 1;
7865 #endif
7866     return get_consumed_bytes(s, buf_index, buf_size);
7867 }
7868 #if 0
7869 /* Fills h->mb_avail[0..5] for the macroblock at (s->mb_x, s->mb_y) using h->slice_table. */
7870 static inline void fill_mb_avail(H264Context *h){
7871     MpegEncContext * const s = &h->s;
7872     const int cur= s->mb_y*s->mb_stride + s->mb_x; /* slice_table index of the current MB */
7873     const int up = cur - s->mb_stride;             /* one row above; only dereferenced when mb_y != 0 */
7874     const int has_left = s->mb_x != 0;
7875     const int has_right= s->mb_x+1 < s->mb_width;
7876     /* entries 0..2: up-left, up, up-right; set only when that slice_table entry equals h->slice_num */
7877     h->mb_avail[0]= s->mb_y && has_left  && h->slice_table[up - 1] == h->slice_num;
7878     h->mb_avail[1]= s->mb_y &&              h->slice_table[up    ] == h->slice_num;
7879     h->mb_avail[2]= s->mb_y && has_right && h->slice_table[up + 1] == h->slice_num;
7880     /* entry 3: left neighbour, same test */
7881     h->mb_avail[3]= has_left && h->slice_table[cur - 1] == h->slice_num;
7882     /* entries 4 and 5 are written with fixed values */
7883     h->mb_avail[4]= 1; //FIXME move out
7884     h->mb_avail[5]= 0; //FIXME move out
7885 }
7886 #endif
7887 /* Self-test entry point, compiled only when TEST is defined: round-trips exp-golomb codes through a bit buffer. */
7888 #ifdef TEST
7889 #undef printf
7890 #undef random
7891 #define COUNT 8000
7892 #define SIZE (COUNT*40)
7893 int main(void){
7894     int i;
7895     uint8_t temp[SIZE];
7896     PutBitContext pb;
7897     GetBitContext gb;
7898 //    int int_temp[10000];
7899     DSPContext dsp;
7900     AVCodecContext avctx;
7901     /* NOTE(review): avctx is handed to dsputil_init() without being initialized - confirm it is not read there */
7902     dsputil_init(&dsp, &avctx);
7903     /* Pass 1: write 0..COUNT-1 with set_ue_golomb(), then read them back and compare */
7904     init_put_bits(&pb, temp, SIZE);
7905     printf("testing unsigned exp golomb\n");
7906     for(i=0; i<COUNT; i++){
7907         START_TIMER
7908         set_ue_golomb(&pb, i);
7909         STOP_TIMER("set_ue_golomb");
7910     }
7911     flush_put_bits(&pb);
7912     /* the reader is given the whole buffer: SIZE bytes = 8*SIZE bits */
7913     init_get_bits(&gb, temp, 8*SIZE);
7914     for(i=0; i<COUNT; i++){
7915         int j, s;
7916         /* s keeps show_bits(&gb, 24) taken before decoding; it is printed as "bits" on a mismatch */
7917         s= show_bits(&gb, 24);
7918
7919         START_TIMER
7920         j= get_ue_golomb(&gb);
7921         if(j != i){
7922             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7923 //            return -1;
7924         }
7925         STOP_TIMER("get_ue_golomb");
7926     }
7927
7928     /* Pass 2: same round trip with set_se_golomb()/get_se_golomb() on the values i - COUNT/2 */
7929     init_put_bits(&pb, temp, SIZE);
7930     printf("testing signed exp golomb\n");
7931     for(i=0; i<COUNT; i++){
7932         START_TIMER
7933         set_se_golomb(&pb, i - COUNT/2);
7934         STOP_TIMER("set_se_golomb");
7935     }
7936     flush_put_bits(&pb);
7937
7938     init_get_bits(&gb, temp, 8*SIZE);
7939     for(i=0; i<COUNT; i++){
7940         int j, s;
7941
7942         s= show_bits(&gb, 24);
7943
7944         START_TIMER
7945         j= get_se_golomb(&gb);
7946         if(j != i - COUNT/2){
7947             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s); /* expected value is what was encoded */
7948 //            return -1;
7949         }
7950         STOP_TIMER("get_se_golomb");
7951     }
7952     /* The section below is compiled out; it uses names not declared in this function (s, qp, src1_block, src2_block) */
7953 #if 0
7954     printf("testing 4x4 (I)DCT\n");
7955
7956     DCTELEM block[16];
7957     uint8_t src[16], ref[16];
7958     uint64_t error= 0, max_error=0;
7959
7960     for(i=0; i<COUNT; i++){
7961         int j;
7962 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7963         for(j=0; j<16; j++){
7964             ref[j]= random()%255;
7965             src[j]= random()%255;
7966         }
7967
7968         h264_diff_dct_c(block, src, ref, 4);
7969
7970         //normalize
7971         for(j=0; j<16; j++){
7972 //            printf("%d ", block[j]);
7973             block[j]= block[j]*4;
7974             if(j&1) block[j]= (block[j]*4 + 2)/5;
7975             if(j&4) block[j]= (block[j]*4 + 2)/5;
7976         }
7977 //        printf("\n");
7978
7979         s->dsp.h264_idct_add(ref, block, 4);
7980 /*        for(j=0; j<16; j++){
7981             printf("%d ", ref[j]);
7982         }
7983         printf("\n");*/
7984
7985         for(j=0; j<16; j++){
7986             int diff= FFABS(src[j] - ref[j]);
7987
7988             error+= diff*diff;
7989             max_error= FFMAX(max_error, diff);
7990         }
7991     }
7992     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7993     printf("testing quantizer\n");
7994     for(qp=0; qp<52; qp++){
7995         for(i=0; i<16; i++)
7996             src1_block[i]= src2_block[i]= random()%255;
7997
7998     }
7999     printf("Testing NAL layer\n");
8000
8001     uint8_t bitstream[COUNT];
8002     uint8_t nal[COUNT*2];
8003     H264Context h;
8004     memset(&h, 0, sizeof(H264Context));
8005
8006     for(i=0; i<COUNT; i++){
8007         int zeros= i;
8008         int nal_length;
8009         int consumed;
8010         int out_length;
8011         uint8_t *out;
8012         int j;
8013
8014         for(j=0; j<COUNT; j++){
8015             bitstream[j]= (random() % 255) + 1;
8016         }
8017
8018         for(j=0; j<zeros; j++){
8019             int pos= random() % COUNT;
8020             while(bitstream[pos] == 0){
8021                 pos++;
8022                 pos %= COUNT;
8023             }
8024             bitstream[pos]=0;
8025         }
8026
8027         START_TIMER
8028
8029         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8030         if(nal_length<0){
8031             printf("encoding failed\n");
8032             return -1;
8033         }
8034
8035         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8036
8037         STOP_TIMER("NAL")
8038
8039         if(out_length != COUNT){
8040             printf("incorrect length %d %d\n", out_length, COUNT);
8041             return -1;
8042         }
8043
8044         if(consumed != nal_length){
8045             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8046             return -1;
8047         }
8048
8049         if(memcmp(bitstream, out, COUNT)){
8050             printf("mismatch\n");
8051             return -1;
8052         }
8053     }
8054 #endif
8055     /* the RBSP stage only prints its banner; no checks follow it */
8056     printf("Testing RBSP\n");
8057
8058     /* mismatches above are reported on stdout only; the enabled code always exits with 0 */
8059     return 0;
8060 }
8061 #endif /* TEST */
8062 /* Decoder teardown, referenced by h264_decoder and h264_vdpau_decoder: frees the RBSP buffers, */
8063 /* calls free_tables(), frees every SPS/PPS buffer slot, calls MPV_common_end(); always returns 0. */
8064 static av_cold int decode_end(AVCodecContext *avctx)
8065 {
8066     H264Context *ctx = avctx->priv_data;
8067     int n;
8068
8069     /* rbsp_buffer[0] and rbsp_buffer[1] */
8070     for(n = 0; n < 2; n++)
8071         av_freep(&ctx->rbsp_buffer[n]);
8072     free_tables(ctx); //FIXME cleanup init stuff perhaps
8073
8074     /* av_freep() is applied to each parameter-set slot in turn */
8075     for(n = 0; n < MAX_SPS_COUNT; n++)
8076         av_freep(&ctx->sps_buffers[n]);
8077     for(n = 0; n < MAX_PPS_COUNT; n++)
8078         av_freep(&ctx->pps_buffers[n]);
8079
8080     MPV_common_end(&ctx->s);
8081
8082 //    memset(ctx, 0, sizeof(H264Context));
8083
8084     return 0;
8085 }
8086 /* Codec descriptor whose first initializer is the string "h264". */
8087 /* The leading initializers are positional, so their order must match the AVCodec declaration. */
8088 AVCodec h264_decoder = {
8089     "h264",
8090     CODEC_TYPE_VIDEO,
8091     CODEC_ID_H264,
8092     sizeof(H264Context), /* byte size of H264Context */
8093     decode_init,
8094     NULL, /* NOTE(review): presumably an unused callback slot (encode?) - confirm against AVCodec */
8095     decode_end,
8096     decode_frame,
8097     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8098     .flush= flush_dpb,
8099     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8100 };
8101 /* VDPAU descriptor, built only when CONFIG_H264_VDPAU_DECODER is set; same callbacks as h264_decoder. */
8102 #if CONFIG_H264_VDPAU_DECODER
8103 AVCodec h264_vdpau_decoder = {
8104     "h264_vdpau",
8105     CODEC_TYPE_VIDEO,
8106     CODEC_ID_H264,
8107     sizeof(H264Context), /* byte size of H264Context */
8108     decode_init,
8109     NULL, /* NOTE(review): presumably an unused callback slot (encode?) - confirm against AVCodec */
8110     decode_end,
8111     decode_frame,
8112     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU, /* h264_decoder's flags plus CODEC_CAP_HWACCEL_VDPAU */
8113     .flush= flush_dpb,
8114     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8115 };
8116 #endif
8117
8118 #if CONFIG_SVQ3_DECODER
8119 #include "svq3.c"
8120 #endif