libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
   54 static VLC coeff_token_vlc[4]; /* four coeff_token VLC objects, one per entry of coeff_token_vlc_tables_size */
   55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2]; /* single array whose length is the sum of the four sizes listed on the next line */
   56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256}; /* NOTE(review): presumably the number of rows of coeff_token_vlc_tables given to each of the four VLCs -- confirm against the VLC init code, which is not in this part of the file */
   57
   58 static VLC chroma_dc_coeff_token_vlc; /* single VLC object for the chroma DC coeff_token code */
   59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2]; /* row count equals chroma_dc_coeff_token_vlc_table_size */
   60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
   61
   62 static VLC total_zeros_vlc[15]; /* fifteen VLC objects; total_zeros_vlc_tables has one 512-row slice per object */
   63 static VLC_TYPE total_zeros_vlc_tables[15][512][2]; /* second dimension equals total_zeros_vlc_tables_size */
   64 static const int total_zeros_vlc_tables_size = 512;
   65
   66 static VLC chroma_dc_total_zeros_vlc[3]; /* three VLC objects; chroma_dc_total_zeros_vlc_tables has one 8-row slice per object */
   67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2]; /* second dimension equals chroma_dc_total_zeros_vlc_tables_size */
   68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
   69
   70 static VLC run_vlc[6]; /* six VLC objects; run_vlc_tables has one 8-row slice per object */
   71 static VLC_TYPE run_vlc_tables[6][8][2]; /* second dimension equals run_vlc_tables_size */
   72 static const int run_vlc_tables_size = 8;
   73
   74 static VLC run7_vlc; /* single VLC object, kept separate from run_vlc[] with its own larger table */
   75 static VLC_TYPE run7_vlc_table[96][2]; /* row count equals run7_vlc_table_size */
   76 static const int run7_vlc_table_size = 96;
  77
   78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); /* forward declarations of file-local (static) functions; their definitions are not in this part of the file */
   79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
   80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); /* takes a macroblock position, the three plane pointers and the luma/chroma line sizes */
   81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); /* same parameter list as filter_mb */
   82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106
  107 #define LEVEL_TAB_BITS 8 /* log2 of the middle dimension of cavlc_level_tab below (1<<8 == 256 entries) */
  108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2]; /* NOTE(review): presumably a CAVLC level lookup with 7 sub-tables indexed by the next LEVEL_TAB_BITS bitstream bits, two values per entry -- the code that fills and reads it is not in this part of the file, confirm there */
 109
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values where changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
/**
 * Fetches the motion vector and reference index of the diagonal neighbour
 * (candidate C of the motion vector prediction).
 * The top-right neighbour is used unless its ref_cache entry is
 * PART_NOT_AVAILABLE, in which case the top-left neighbour is used instead.
 * With FRAME_MBAFF, some frame/field combinations are read directly from the
 * current picture's motion_val / ref_index arrays through SET_DIAG_MV, which
 * returns from this function.
 * @param C set to point at the selected motion vector
 * @param i cache index of the block (a scan8[] value)
 * @param list reference list index
 * @param part_width offset added to the top neighbour's cache index to reach
 *                   the top-right entry
 * @return the reference index belonging to *C
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* mv_cache[list][scan8[0]-2] is used as scratch space for the
         * rescaled vector; it is cleared here and *C points at it until the
         * generic code after this block reassigns *C */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the vector at 4x4 block position (X4,Y4) of the current
 * picture into the scratch entry, applies MV_OP to its vertical component and
 * returns the stored reference index with REF_OP applied, or LIST_NOT_USED if
 * the macroblock containing that position does not use this list.
 * Every path through the macro returns from fetch_diagonal_mv. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right missing: the fallback neighbour lies in the left
         * macroblock, whose frame/field type may differ from the current one */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* generic case: take the neighbour straight from the caches */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        /* fall back to the top-left neighbour */
        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 997
 998     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 999         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1000             int cur_poc = s->current_picture_ptr->poc;
1001             int *col_poc = h->ref_list[1]->field_poc;
1002             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1003             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1004             b8_stride = 0;
1005         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1006             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1007             mb_xy += s->mb_stride*fieldoff;
1008         }
1009         goto single_col;
1010     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1011         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1012             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1013             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1014             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1015             b8_stride *= 3;
1016             b4_stride *= 6;
1017             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1018             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1019                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1020                 && !is_b8x8){
1021                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1022                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1023             }else{
1024                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1025                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1026             }
1027         }else{                                           //     AFR/FR    -> AFR/FR
1028 single_col:
1029             mb_type_col[0] =
1030             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1031             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1032                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1033                 * so we know exactly what block size to use */
1034                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1035                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1036             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1037                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1038                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1039             }else{
1040                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }
1043         }
1044     }
1045
1046     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1047     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1048     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1049     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1050     if(!b8_stride){
1051         if(s->mb_y&1){
1052             l1ref0 += h->b8_stride;
1053             l1ref1 += h->b8_stride;
1054             l1mv0  +=  2*b4_stride;
1055             l1mv1  +=  2*b4_stride;
1056         }
1057     }
1058
1059     if(h->direct_spatial_mv_pred){
1060         int ref[2];
1061         int mv[2][2];
1062         int list;
1063
1064         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1065
1066         /* ref = min(neighbors) */
1067         for(list=0; list<2; list++){
1068             int refa = h->ref_cache[list][scan8[0] - 1];
1069             int refb = h->ref_cache[list][scan8[0] - 8];
1070             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1071             if(refc == PART_NOT_AVAILABLE)
1072                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1073             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1074             if(ref[list] < 0)
1075                 ref[list] = -1;
1076         }
1077
1078         if(ref[0] < 0 && ref[1] < 0){
1079             ref[0] = ref[1] = 0;
1080             mv[0][0] = mv[0][1] =
1081             mv[1][0] = mv[1][1] = 0;
1082         }else{
1083             for(list=0; list<2; list++){
1084                 if(ref[list] >= 0)
1085                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1086                 else
1087                     mv[list][0] = mv[list][1] = 0;
1088             }
1089         }
1090
1091         if(ref[1] < 0){
1092             if(!is_b8x8)
1093                 *mb_type &= ~MB_TYPE_L1;
1094             sub_mb_type &= ~MB_TYPE_L1;
1095         }else if(ref[0] < 0){
1096             if(!is_b8x8)
1097                 *mb_type &= ~MB_TYPE_L0;
1098             sub_mb_type &= ~MB_TYPE_L0;
1099         }
1100
1101         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1102             for(i8=0; i8<4; i8++){
1103                 int x8 = i8&1;
1104                 int y8 = i8>>1;
1105                 int xy8 = x8+y8*b8_stride;
1106                 int xy4 = 3*x8+y8*b4_stride;
1107                 int a=0, b=0;
1108
1109                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1110                     continue;
1111                 h->sub_mb_type[i8] = sub_mb_type;
1112
1113                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1114                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1115                 if(!IS_INTRA(mb_type_col[y8])
1116                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1117                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1118                     if(ref[0] > 0)
1119                         a= pack16to32(mv[0][0],mv[0][1]);
1120                     if(ref[1] > 0)
1121                         b= pack16to32(mv[1][0],mv[1][1]);
1122                 }else{
1123                     a= pack16to32(mv[0][0],mv[0][1]);
1124                     b= pack16to32(mv[1][0],mv[1][1]);
1125                 }
1126                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1127                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1128             }
1129         }else if(IS_16X16(*mb_type)){
1130             int a=0, b=0;
1131
1132             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1133             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1134             if(!IS_INTRA(mb_type_col[0])
1135                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1136                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1137                        && (h->x264_build>33 || !h->x264_build)))){
1138                 if(ref[0] > 0)
1139                     a= pack16to32(mv[0][0],mv[0][1]);
1140                 if(ref[1] > 0)
1141                     b= pack16to32(mv[1][0],mv[1][1]);
1142             }else{
1143                 a= pack16to32(mv[0][0],mv[0][1]);
1144                 b= pack16to32(mv[1][0],mv[1][1]);
1145             }
1146             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1147             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1148         }else{
1149             for(i8=0; i8<4; i8++){
1150                 const int x8 = i8&1;
1151                 const int y8 = i8>>1;
1152
1153                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1154                     continue;
1155                 h->sub_mb_type[i8] = sub_mb_type;
1156
1157                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1158                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1159                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1160                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1161
1162                 /* col_zero_flag */
1163                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1164                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1165                                                   && (h->x264_build>33 || !h->x264_build)))){
1166                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1167                     if(IS_SUB_8X8(sub_mb_type)){
1168                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1169                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1170                             if(ref[0] == 0)
1171                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1172                             if(ref[1] == 0)
1173                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                         }
1175                     }else
1176                     for(i4=0; i4<4; i4++){
1177                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1178                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1179                             if(ref[0] == 0)
1180                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1181                             if(ref[1] == 0)
1182                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1183                         }
1184                     }
1185                 }
1186             }
1187         }
1188     }else{ /* direct temporal mv pred */
1189         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1190         const int *dist_scale_factor = h->dist_scale_factor;
1191         int ref_offset= 0;
1192
1193         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1194             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1195             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1196             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1197         }
1198         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1199             ref_offset += 16;
1200
1201         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1202             /* FIXME assumes direct_8x8_inference == 1 */
1203             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1204
1205             for(i8=0; i8<4; i8++){
1206                 const int x8 = i8&1;
1207                 const int y8 = i8>>1;
1208                 int ref0, scale;
1209                 const int16_t (*l1mv)[2]= l1mv0;
1210
1211                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1212                     continue;
1213                 h->sub_mb_type[i8] = sub_mb_type;
1214
1215                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1216                 if(IS_INTRA(mb_type_col[y8])){
1217                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1220                     continue;
1221                 }
1222
1223                 ref0 = l1ref0[x8 + y8*b8_stride];
1224                 if(ref0 >= 0)
1225                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1226                 else{
1227                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1228                     l1mv= l1mv1;
1229                 }
1230                 scale = dist_scale_factor[ref0];
1231                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1232
1233                 {
1234                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1235                     int my_col = (mv_col[1]<<y_shift)/2;
1236                     int mx = (scale * mv_col[0] + 128) >> 8;
1237                     int my = (scale * my_col + 128) >> 8;
1238                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1239                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1240                 }
1241             }
1242             return;
1243         }
1244
1245         /* one-to-one mv scaling */
1246
1247         if(IS_16X16(*mb_type)){
1248             int ref, mv0, mv1;
1249
1250             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1251             if(IS_INTRA(mb_type_col[0])){
1252                 ref=mv0=mv1=0;
1253             }else{
1254                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1255                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1256                 const int scale = dist_scale_factor[ref0];
1257                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1258                 int mv_l0[2];
1259                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1260                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1261                 ref= ref0;
1262                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1263                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1264             }
1265             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1266             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1267             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1268         }else{
1269             for(i8=0; i8<4; i8++){
1270                 const int x8 = i8&1;
1271                 const int y8 = i8>>1;
1272                 int ref0, scale;
1273                 const int16_t (*l1mv)[2]= l1mv0;
1274
1275                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1276                     continue;
1277                 h->sub_mb_type[i8] = sub_mb_type;
1278                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1279                 if(IS_INTRA(mb_type_col[0])){
1280                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1283                     continue;
1284                 }
1285
1286                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1287                 if(ref0 >= 0)
1288                     ref0 = map_col_to_list0[0][ref0];
1289                 else{
1290                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1291                     l1mv= l1mv1;
1292                 }
1293                 scale = dist_scale_factor[ref0];
1294
1295                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1296                 if(IS_SUB_8X8(sub_mb_type)){
1297                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1298                     int mx = (scale * mv_col[0] + 128) >> 8;
1299                     int my = (scale * mv_col[1] + 128) >> 8;
1300                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1301                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1302                 }else
1303                 for(i4=0; i4<4; i4++){
1304                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1305                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1306                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1307                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1308                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1309                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1310                 }
1311             }
1312         }
1313     }
1314 }
1315
1316 static inline void write_back_motion(H264Context *h, int mb_type){
1317     MpegEncContext * const s = &h->s;
1318     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1319     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1320     int list;
1321
1322     if(!USES_LIST(mb_type, 0))
1323         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1324
1325     for(list=0; list<h->list_count; list++){
1326         int y;
1327         if(!USES_LIST(mb_type, list))
1328             continue;
1329
1330         for(y=0; y<4; y++){
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1332             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1333         }
1334         if( h->pps.cabac ) {
1335             if(IS_SKIP(mb_type))
1336                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1337             else
1338             for(y=0; y<4; y++){
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1340                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1341             }
1342         }
1343
1344         {
1345             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1346             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1347             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1348             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1349             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1350         }
1351     }
1352
1353     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1354         if(IS_8X8(mb_type)){
1355             uint8_t *direct_table = &h->direct_table[b8_xy];
1356             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1357             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1358             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1359         }
1360     }
1361 }
1362
1363 const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1364     int i, si, di;
1365     uint8_t *dst;
1366     int bufidx;
1367
1368 //    src[0]&0x80;                //forbidden bit
1369     h->nal_ref_idc= src[0]>>5;
1370     h->nal_unit_type= src[0]&0x1F;
1371
1372     src++; length--;
1373 #if 0
1374     for(i=0; i<length; i++)
1375         printf("%2X ", src[i]);
1376 #endif
1377
1378 #if HAVE_FAST_UNALIGNED
1379 # if HAVE_FAST_64BIT
1380 #   define RS 7
1381     for(i=0; i+1<length; i+=9){
1382         if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1383 # else
1384 #   define RS 3
1385     for(i=0; i+1<length; i+=5){
1386         if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1387 # endif
1388             continue;
1389         if(i>0 && !src[i]) i--;
1390         while(src[i]) i++;
1391 #else
1392 #   define RS 0
1393     for(i=0; i+1<length; i+=2){
1394         if(src[i]) continue;
1395         if(i>0 && src[i-1]==0) i--;
1396 #endif
1397         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1398             if(src[i+2]!=3){
1399                 /* startcode, so we must be past the end */
1400                 length=i;
1401             }
1402             break;
1403         }
1404         i-= RS;
1405     }
1406
1407     if(i>=length-1){ //no escaped 0
1408         *dst_length= length;
1409         *consumed= length+1; //+1 for the header
1410         return src;
1411     }
1412
1413     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1414     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1415     dst= h->rbsp_buffer[bufidx];
1416
1417     if (dst == NULL){
1418         return NULL;
1419     }
1420
1421 //printf("decoding esc\n");
1422     memcpy(dst, src, i);
1423     si=di=i;
1424     while(si+2<length){
1425         //remove escapes (very rare 1:2^22)
1426         if(src[si+2]>3){
1427             dst[di++]= src[si++];
1428             dst[di++]= src[si++];
1429         }else if(src[si]==0 && src[si+1]==0){
1430             if(src[si+2]==3){ //escape
1431                 dst[di++]= 0;
1432                 dst[di++]= 0;
1433                 si+=3;
1434                 continue;
1435             }else //next start code
1436                 goto nsc;
1437         }
1438
1439         dst[di++]= src[si++];
1440     }
1441     while(si<length)
1442         dst[di++]= src[si++];
1443 nsc:
1444
1445     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1446
1447     *dst_length= di;
1448     *consumed= si + 1;//+1 for the header
1449 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1450     return dst;
1451 }
1452
1453 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1454     int v= *src;
1455     int r;
1456
1457     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1458
1459     for(r=1; r<9; r++){
1460         if(v&1) return r;
1461         v>>=1;
1462     }
1463     return 0;
1464 }
1465
1466 /**
1467  * IDCT transforms the 16 dc values and dequantizes them.
1468  * @param qp quantization parameter
1469  */
1470 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1471 #define stride 16
1472     int i;
1473     int temp[16]; //FIXME check if this is a good idea
1474     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1475     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1476
1477 //memset(block, 64, 2*256);
1478 //return;
1479     for(i=0; i<4; i++){
1480         const int offset= y_offset[i];
1481         const int z0= block[offset+stride*0] + block[offset+stride*4];
1482         const int z1= block[offset+stride*0] - block[offset+stride*4];
1483         const int z2= block[offset+stride*1] - block[offset+stride*5];
1484         const int z3= block[offset+stride*1] + block[offset+stride*5];
1485
1486         temp[4*i+0]= z0+z3;
1487         temp[4*i+1]= z1+z2;
1488         temp[4*i+2]= z1-z2;
1489         temp[4*i+3]= z0-z3;
1490     }
1491
1492     for(i=0; i<4; i++){
1493         const int offset= x_offset[i];
1494         const int z0= temp[4*0+i] + temp[4*2+i];
1495         const int z1= temp[4*0+i] - temp[4*2+i];
1496         const int z2= temp[4*1+i] - temp[4*3+i];
1497         const int z3= temp[4*1+i] + temp[4*3+i];
1498
1499         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1500         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1501         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1502         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1503     }
1504 }
1505
1506 #if 0
1507 /**
1508  * DCT transforms the 16 dc values.
1509  * @param qp quantization parameter ??? FIXME
1510  */
1511 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1512 //    const int qmul= dequant_coeff[qp][0];
1513     int i;
1514     int temp[16]; //FIXME check if this is a good idea
1515     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1516     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1517
1518     for(i=0; i<4; i++){
1519         const int offset= y_offset[i];
1520         const int z0= block[offset+stride*0] + block[offset+stride*4];
1521         const int z1= block[offset+stride*0] - block[offset+stride*4];
1522         const int z2= block[offset+stride*1] - block[offset+stride*5];
1523         const int z3= block[offset+stride*1] + block[offset+stride*5];
1524
1525         temp[4*i+0]= z0+z3;
1526         temp[4*i+1]= z1+z2;
1527         temp[4*i+2]= z1-z2;
1528         temp[4*i+3]= z0-z3;
1529     }
1530
1531     for(i=0; i<4; i++){
1532         const int offset= x_offset[i];
1533         const int z0= temp[4*0+i] + temp[4*2+i];
1534         const int z1= temp[4*0+i] - temp[4*2+i];
1535         const int z2= temp[4*1+i] - temp[4*3+i];
1536         const int z3= temp[4*1+i] + temp[4*3+i];
1537
1538         block[stride*0 +offset]= (z0 + z3)>>1;
1539         block[stride*2 +offset]= (z1 + z2)>>1;
1540         block[stride*8 +offset]= (z1 - z2)>>1;
1541         block[stride*10+offset]= (z0 - z3)>>1;
1542     }
1543 }
1544 #endif
1545
1546 #undef xStride
1547 #undef stride
1548
1549 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1550     const int stride= 16*2;
1551     const int xStride= 16;
1552     int a,b,c,d,e;
1553
1554     a= block[stride*0 + xStride*0];
1555     b= block[stride*0 + xStride*1];
1556     c= block[stride*1 + xStride*0];
1557     d= block[stride*1 + xStride*1];
1558
1559     e= a-b;
1560     a= a+b;
1561     b= c-d;
1562     c= c+d;
1563
1564     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1565     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1566     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1567     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1568 }
1569
1570 #if 0
1571 static void chroma_dc_dct_c(DCTELEM *block){
1572     const int stride= 16*2;
1573     const int xStride= 16;
1574     int a,b,c,d,e;
1575
1576     a= block[stride*0 + xStride*0];
1577     b= block[stride*0 + xStride*1];
1578     c= block[stride*1 + xStride*0];
1579     d= block[stride*1 + xStride*1];
1580
1581     e= a-b;
1582     a= a+b;
1583     b= c-d;
1584     c= c+d;
1585
1586     block[stride*0 + xStride*0]= (a+c);
1587     block[stride*0 + xStride*1]= (e+b);
1588     block[stride*1 + xStride*0]= (a-c);
1589     block[stride*1 + xStride*1]= (e-b);
1590 }
1591 #endif
1592
1593 /**
1594  * gets the chroma qp.
1595  */
1596 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1597     return h->pps.chroma_qp_table[t][qscale];
1598 }
1599
1600 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1601                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1602                            int src_x_offset, int src_y_offset,
1603                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1604     MpegEncContext * const s = &h->s;
1605     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1606     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1607     const int luma_xy= (mx&3) + ((my&3)<<2);
1608     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1609     uint8_t * src_cb, * src_cr;
1610     int extra_width= h->emu_edge_width;
1611     int extra_height= h->emu_edge_height;
1612     int emu=0;
1613     const int full_mx= mx>>2;
1614     const int full_my= my>>2;
1615     const int pic_width  = 16*s->mb_width;
1616     const int pic_height = 16*s->mb_height >> MB_FIELD;
1617
1618     if(mx&7) extra_width -= 3;
1619     if(my&7) extra_height -= 3;
1620
1621     if(   full_mx < 0-extra_width
1622        || full_my < 0-extra_height
1623        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1624        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1625         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1626             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1627         emu=1;
1628     }
1629
1630     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1631     if(!square){
1632         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1633     }
1634
1635     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1636
1637     if(MB_FIELD){
1638         // chroma offset when predicting from a field of opposite parity
1639         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1640         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1641     }
1642     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1643     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1644
1645     if(emu){
1646         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1647             src_cb= s->edge_emu_buffer;
1648     }
1649     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1650
1651     if(emu){
1652         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1653             src_cr= s->edge_emu_buffer;
1654     }
1655     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1656 }
1657
1658 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1659                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1660                            int x_offset, int y_offset,
1661                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1662                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1663                            int list0, int list1){
1664     MpegEncContext * const s = &h->s;
1665     qpel_mc_func *qpix_op=  qpix_put;
1666     h264_chroma_mc_func chroma_op= chroma_put;
1667
1668     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1669     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1670     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1671     x_offset += 8*s->mb_x;
1672     y_offset += 8*(s->mb_y >> MB_FIELD);
1673
1674     if(list0){
1675         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679
1680         qpix_op=  qpix_avg;
1681         chroma_op= chroma_avg;
1682     }
1683
1684     if(list1){
1685         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689     }
1690 }
1691
1692 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1693                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1694                            int x_offset, int y_offset,
1695                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1696                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1697                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1698                            int list0, int list1){
1699     MpegEncContext * const s = &h->s;
1700
1701     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1702     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1703     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1704     x_offset += 8*s->mb_x;
1705     y_offset += 8*(s->mb_y >> MB_FIELD);
1706
1707     if(list0 && list1){
1708         /* don't optimize for luma-only case, since B-frames usually
1709          * use implicit weights => chroma too. */
1710         uint8_t *tmp_cb = s->obmc_scratchpad;
1711         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1712         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1713         int refn0 = h->ref_cache[0][ scan8[n] ];
1714         int refn1 = h->ref_cache[1][ scan8[n] ];
1715
1716         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1717                     dest_y, dest_cb, dest_cr,
1718                     x_offset, y_offset, qpix_put, chroma_put);
1719         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1720                     tmp_y, tmp_cb, tmp_cr,
1721                     x_offset, y_offset, qpix_put, chroma_put);
1722
1723         if(h->use_weight == 2){
1724             int weight0 = h->implicit_weight[refn0][refn1];
1725             int weight1 = 64 - weight0;
1726             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1727             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1729         }else{
1730             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1731                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1732                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1733             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1734                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1735                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1736             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1737                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1738                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1739         }
1740     }else{
1741         int list = list1 ? 1 : 0;
1742         int refn = h->ref_cache[list][ scan8[n] ];
1743         Picture *ref= &h->ref_list[list][refn];
1744         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1745                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1746                     qpix_put, chroma_put);
1747
1748         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1749                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1750         if(h->use_weight_chroma){
1751             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1752                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1753             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1755         }
1756     }
1757 }
1758
1759 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1760                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1761                            int x_offset, int y_offset,
1762                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1763                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1764                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1765                            int list0, int list1){
1766     if((h->use_weight==2 && list0 && list1
1767         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1768        || h->use_weight==1)
1769         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1770                          x_offset, y_offset, qpix_put, chroma_put,
1771                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1772     else
1773         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1774                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1775 }
1776
1777 static inline void prefetch_motion(H264Context *h, int list){
1778     /* fetch pixels for estimated mv 4 macroblocks ahead
1779      * optimized for 64byte cache lines */
1780     MpegEncContext * const s = &h->s;
1781     const int refn = h->ref_cache[list][scan8[0]];
1782     if(refn >= 0){
1783         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1784         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1785         uint8_t **src= h->ref_list[list][refn].data;
1786         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1787         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1788         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1789         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1790     }
1791 }
1792
1793 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1794                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1795                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1796                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1797     MpegEncContext * const s = &h->s;
1798     const int mb_xy= h->mb_xy;
1799     const int mb_type= s->current_picture.mb_type[mb_xy];
1800
1801     assert(IS_INTER(mb_type));
1802
1803     prefetch_motion(h, 0);
1804
1805     if(IS_16X16(mb_type)){
1806         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1807                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1808                 &weight_op[0], &weight_avg[0],
1809                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1810     }else if(IS_16X8(mb_type)){
1811         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1812                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1813                 &weight_op[1], &weight_avg[1],
1814                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1815         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1816                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1817                 &weight_op[1], &weight_avg[1],
1818                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1819     }else if(IS_8X16(mb_type)){
1820         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1821                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1822                 &weight_op[2], &weight_avg[2],
1823                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1824         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1825                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1826                 &weight_op[2], &weight_avg[2],
1827                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1828     }else{
1829         int i;
1830
1831         assert(IS_8X8(mb_type));
1832
1833         for(i=0; i<4; i++){
1834             const int sub_mb_type= h->sub_mb_type[i];
1835             const int n= 4*i;
1836             int x_offset= (i&1)<<2;
1837             int y_offset= (i&2)<<1;
1838
1839             if(IS_SUB_8X8(sub_mb_type)){
1840                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1841                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1842                     &weight_op[3], &weight_avg[3],
1843                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1844             }else if(IS_SUB_8X4(sub_mb_type)){
1845                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1846                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1847                     &weight_op[4], &weight_avg[4],
1848                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1849                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1850                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1851                     &weight_op[4], &weight_avg[4],
1852                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1853             }else if(IS_SUB_4X8(sub_mb_type)){
1854                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1855                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1856                     &weight_op[5], &weight_avg[5],
1857                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1858                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1859                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1860                     &weight_op[5], &weight_avg[5],
1861                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1862             }else{
1863                 int j;
1864                 assert(IS_SUB_4X4(sub_mb_type));
1865                 for(j=0; j<4; j++){
1866                     int sub_x_offset= x_offset + 2*(j&1);
1867                     int sub_y_offset= y_offset +   (j&2);
1868                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1869                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1870                         &weight_op[6], &weight_avg[6],
1871                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1872                 }
1873             }
1874         }
1875     }
1876
1877     prefetch_motion(h, 1);
1878 }
1879
1880 static av_cold void init_cavlc_level_tab(void){
1881     int suffix_length, mask;
1882     unsigned int i;
1883
1884     for(suffix_length=0; suffix_length<7; suffix_length++){
1885         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1886             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1887             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1888
1889             mask= -(level_code&1);
1890             level_code= (((2+level_code)>>1) ^ mask) - mask;
1891             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1892                 cavlc_level_tab[suffix_length][i][0]= level_code;
1893                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1894             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1895                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1896                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1897             }else{
1898                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1899                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1900             }
1901         }
1902     }
1903 }
1904
1905 static av_cold void decode_init_vlc(void){
1906     static int done = 0;
1907
1908     if (!done) {
1909         int i;
1910         int offset;
1911         done = 1;
1912
1913         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1914         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1915         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1916                  &chroma_dc_coeff_token_len [0], 1, 1,
1917                  &chroma_dc_coeff_token_bits[0], 1, 1,
1918                  INIT_VLC_USE_NEW_STATIC);
1919
1920         offset = 0;
1921         for(i=0; i<4; i++){
1922             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1923             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1924             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1925                      &coeff_token_len [i][0], 1, 1,
1926                      &coeff_token_bits[i][0], 1, 1,
1927                      INIT_VLC_USE_NEW_STATIC);
1928             offset += coeff_token_vlc_tables_size[i];
1929         }
1930         /*
1931          * This is a one time safety check to make sure that
1932          * the packed static coeff_token_vlc table sizes
1933          * were initialized correctly.
1934          */
1935         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1936
1937         for(i=0; i<3; i++){
1938             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1939             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1940             init_vlc(&chroma_dc_total_zeros_vlc[i],
1941                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1942                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1943                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1944                      INIT_VLC_USE_NEW_STATIC);
1945         }
1946         for(i=0; i<15; i++){
1947             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1948             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1949             init_vlc(&total_zeros_vlc[i],
1950                      TOTAL_ZEROS_VLC_BITS, 16,
1951                      &total_zeros_len [i][0], 1, 1,
1952                      &total_zeros_bits[i][0], 1, 1,
1953                      INIT_VLC_USE_NEW_STATIC);
1954         }
1955
1956         for(i=0; i<6; i++){
1957             run_vlc[i].table = run_vlc_tables[i];
1958             run_vlc[i].table_allocated = run_vlc_tables_size;
1959             init_vlc(&run_vlc[i],
1960                      RUN_VLC_BITS, 7,
1961                      &run_len [i][0], 1, 1,
1962                      &run_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965         run7_vlc.table = run7_vlc_table,
1966         run7_vlc.table_allocated = run7_vlc_table_size;
1967         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1968                  &run_len [6][0], 1, 1,
1969                  &run_bits[6][0], 1, 1,
1970                  INIT_VLC_USE_NEW_STATIC);
1971
1972         init_cavlc_level_tab();
1973     }
1974 }
1975
1976 static void free_tables(H264Context *h){
1977     int i;
1978     H264Context *hx;
1979     av_freep(&h->intra4x4_pred_mode);
1980     av_freep(&h->chroma_pred_mode_table);
1981     av_freep(&h->cbp_table);
1982     av_freep(&h->mvd_table[0]);
1983     av_freep(&h->mvd_table[1]);
1984     av_freep(&h->direct_table);
1985     av_freep(&h->non_zero_count);
1986     av_freep(&h->slice_table_base);
1987     h->slice_table= NULL;
1988
1989     av_freep(&h->mb2b_xy);
1990     av_freep(&h->mb2b8_xy);
1991
1992     for(i = 0; i < h->s.avctx->thread_count; i++) {
1993         hx = h->thread_context[i];
1994         if(!hx) continue;
1995         av_freep(&hx->top_borders[1]);
1996         av_freep(&hx->top_borders[0]);
1997         av_freep(&hx->s.obmc_scratchpad);
1998     }
1999 }
2000
2001 static void init_dequant8_coeff_table(H264Context *h){
2002     int i,q,x;
2003     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2004     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2005     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2006
2007     for(i=0; i<2; i++ ){
2008         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2009             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2010             break;
2011         }
2012
2013         for(q=0; q<52; q++){
2014             int shift = div6[q];
2015             int idx = rem6[q];
2016             for(x=0; x<64; x++)
2017                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2018                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2019                     h->pps.scaling_matrix8[i][x]) << shift;
2020         }
2021     }
2022 }
2023
2024 static void init_dequant4_coeff_table(H264Context *h){
2025     int i,j,q,x;
2026     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2027     for(i=0; i<6; i++ ){
2028         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2029         for(j=0; j<i; j++){
2030             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2031                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2032                 break;
2033             }
2034         }
2035         if(j<i)
2036             continue;
2037
2038         for(q=0; q<52; q++){
2039             int shift = div6[q] + 2;
2040             int idx = rem6[q];
2041             for(x=0; x<16; x++)
2042                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2043                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2044                     h->pps.scaling_matrix4[i][x]) << shift;
2045         }
2046     }
2047 }
2048
2049 static void init_dequant_tables(H264Context *h){
2050     int i,x;
2051     init_dequant4_coeff_table(h);
2052     if(h->pps.transform_8x8_mode)
2053         init_dequant8_coeff_table(h);
2054     if(h->sps.transform_bypass){
2055         for(i=0; i<6; i++)
2056             for(x=0; x<16; x++)
2057                 h->dequant4_coeff[i][0][x] = 1<<6;
2058         if(h->pps.transform_8x8_mode)
2059             for(i=0; i<2; i++)
2060                 for(x=0; x<64; x++)
2061                     h->dequant8_coeff[i][0][x] = 1<<6;
2062     }
2063 }
2064
2065
2066 /**
2067  * allocates tables.
2068  * needs width/height
2069  */
2070 static int alloc_tables(H264Context *h){
2071     MpegEncContext * const s = &h->s;
2072     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2073     int x,y;
2074
2075     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2076
2077     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2078     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2079     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2080
2081     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2082     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2083     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2084     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2085
2086     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2087     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2088
2089     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2090     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2091     for(y=0; y<s->mb_height; y++){
2092         for(x=0; x<s->mb_width; x++){
2093             const int mb_xy= x + y*s->mb_stride;
2094             const int b_xy = 4*x + 4*y*h->b_stride;
2095             const int b8_xy= 2*x + 2*y*h->b8_stride;
2096
2097             h->mb2b_xy [mb_xy]= b_xy;
2098             h->mb2b8_xy[mb_xy]= b8_xy;
2099         }
2100     }
2101
2102     s->obmc_scratchpad = NULL;
2103
2104     if(!h->dequant4_coeff[0])
2105         init_dequant_tables(h);
2106
2107     return 0;
2108 fail:
2109     free_tables(h);
2110     return -1;
2111 }
2112
2113 /**
2114  * Mimic alloc_tables(), but for every context thread.
2115  */
2116 static void clone_tables(H264Context *dst, H264Context *src){
2117     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2118     dst->non_zero_count           = src->non_zero_count;
2119     dst->slice_table              = src->slice_table;
2120     dst->cbp_table                = src->cbp_table;
2121     dst->mb2b_xy                  = src->mb2b_xy;
2122     dst->mb2b8_xy                 = src->mb2b8_xy;
2123     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2124     dst->mvd_table[0]             = src->mvd_table[0];
2125     dst->mvd_table[1]             = src->mvd_table[1];
2126     dst->direct_table             = src->direct_table;
2127
2128     dst->s.obmc_scratchpad = NULL;
2129     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2130 }
2131
2132 /**
2133  * Init context
2134  * Allocate buffers which are not shared amongst multiple threads.
2135  */
2136 static int context_init(H264Context *h){
2137     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2138     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2139
2140     return 0;
2141 fail:
2142     return -1; // free_tables will clean up for us
2143 }
2144
2145 static av_cold void common_init(H264Context *h){
2146     MpegEncContext * const s = &h->s;
2147
2148     s->width = s->avctx->width;
2149     s->height = s->avctx->height;
2150     s->codec_id= s->avctx->codec->id;
2151
2152     ff_h264_pred_init(&h->hpc, s->codec_id);
2153
2154     h->dequant_coeff_pps= -1;
2155     s->unrestricted_mv=1;
2156     s->decode=1; //FIXME
2157
2158     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2159
2160     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2161     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2162 }
2163
2164 static av_cold int decode_init(AVCodecContext *avctx){
2165     H264Context *h= avctx->priv_data;
2166     MpegEncContext * const s = &h->s;
2167
2168     MPV_decode_defaults(s);
2169
2170     s->avctx = avctx;
2171     common_init(h);
2172
2173     s->out_format = FMT_H264;
2174     s->workaround_bugs= avctx->workaround_bugs;
2175
2176     // set defaults
2177 //    s->decode_mb= ff_h263_decode_mb;
2178     s->quarter_sample = 1;
2179     s->low_delay= 1;
2180
2181     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2182         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2183     else
2184         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2185     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2186
2187     decode_init_vlc();
2188
2189     if(avctx->extradata_size > 0 && avctx->extradata &&
2190        *(char *)avctx->extradata == 1){
2191         h->is_avc = 1;
2192         h->got_avcC = 0;
2193     } else {
2194         h->is_avc = 0;
2195     }
2196
2197     h->thread_context[0] = h;
2198     h->outputed_poc = INT_MIN;
2199     h->prev_poc_msb= 1<<16;
2200     h->sei_recovery_frame_cnt = -1;
2201     h->sei_dpb_output_delay = 0;
2202     h->sei_cpb_removal_delay = -1;
2203     h->sei_buffering_period_present = 0;
2204     return 0;
2205 }
2206
2207 static int frame_start(H264Context *h){
2208     MpegEncContext * const s = &h->s;
2209     int i;
2210
2211     if(MPV_frame_start(s, s->avctx) < 0)
2212         return -1;
2213     ff_er_frame_start(s);
2214     /*
2215      * MPV_frame_start uses pict_type to derive key_frame.
2216      * This is incorrect for H.264; IDR markings must be used.
2217      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2218      * See decode_nal_units().
2219      */
2220     s->current_picture_ptr->key_frame= 0;
2221
2222     assert(s->linesize && s->uvlinesize);
2223
2224     for(i=0; i<16; i++){
2225         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2226         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2227     }
2228     for(i=0; i<4; i++){
2229         h->block_offset[16+i]=
2230         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2231         h->block_offset[24+16+i]=
2232         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2233     }
2234
2235     /* can't be in alloc_tables because linesize isn't known there.
2236      * FIXME: redo bipred weight to not require extra buffer? */
2237     for(i = 0; i < s->avctx->thread_count; i++)
2238         if(!h->thread_context[i]->s.obmc_scratchpad)
2239             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2240
2241     /* some macroblocks will be accessed before they're available */
2242     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2243         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2244
2245 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2246
2247     // We mark the current picture as non-reference after allocating it, so
2248     // that if we break out due to an error it can be released automatically
2249     // in the next MPV_frame_start().
2250     // SVQ3 as well as most other codecs have only last/next/current and thus
2251     // get released even with set reference, besides SVQ3 and others do not
2252     // mark frames as reference later "naturally".
2253     if(s->codec_id != CODEC_ID_SVQ3)
2254         s->current_picture_ptr->reference= 0;
2255
2256     s->current_picture_ptr->field_poc[0]=
2257     s->current_picture_ptr->field_poc[1]= INT_MAX;
2258     assert(s->current_picture_ptr->long_ref==0);
2259
2260     return 0;
2261 }
2262
2263 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2264     MpegEncContext * const s = &h->s;
2265     int i;
2266     int step    = 1;
2267     int offset  = 1;
2268     int uvoffset= 1;
2269     int top_idx = 1;
2270     int skiplast= 0;
2271
2272     src_y  -=   linesize;
2273     src_cb -= uvlinesize;
2274     src_cr -= uvlinesize;
2275
2276     if(!simple && FRAME_MBAFF){
2277         if(s->mb_y&1){
2278             offset  = MB_MBAFF ? 1 : 17;
2279             uvoffset= MB_MBAFF ? 1 : 9;
2280             if(!MB_MBAFF){
2281                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2282                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2283                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2284                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2285                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2286                 }
2287             }
2288         }else{
2289             if(!MB_MBAFF){
2290                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2291                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2292                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2293                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2294                 }
2295                 skiplast= 1;
2296             }
2297             offset  =
2298             uvoffset=
2299             top_idx = MB_MBAFF ? 0 : 1;
2300         }
2301         step= MB_MBAFF ? 2 : 1;
2302     }
2303
2304     // There are two lines saved, the line above the the top macroblock of a pair,
2305     // and the line above the bottom macroblock
2306     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2307     for(i=1; i<17 - skiplast; i++){
2308         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2309     }
2310
2311     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2312     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2313
2314     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2315         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2316         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2317         for(i=1; i<9 - skiplast; i++){
2318             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2319             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2320         }
2321         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2322         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2323     }
2324 }
2325
2326 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2327     MpegEncContext * const s = &h->s;
2328     int temp8, i;
2329     uint64_t temp64;
2330     int deblock_left;
2331     int deblock_top;
2332     int mb_xy;
2333     int step    = 1;
2334     int offset  = 1;
2335     int uvoffset= 1;
2336     int top_idx = 1;
2337
2338     if(!simple && FRAME_MBAFF){
2339         if(s->mb_y&1){
2340             offset  = MB_MBAFF ? 1 : 17;
2341             uvoffset= MB_MBAFF ? 1 : 9;
2342         }else{
2343             offset  =
2344             uvoffset=
2345             top_idx = MB_MBAFF ? 0 : 1;
2346         }
2347         step= MB_MBAFF ? 2 : 1;
2348     }
2349
2350     if(h->deblocking_filter == 2) {
2351         mb_xy = h->mb_xy;
2352         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2353         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2354     } else {
2355         deblock_left = (s->mb_x > 0);
2356         deblock_top =  (s->mb_y > !!MB_FIELD);
2357     }
2358
2359     src_y  -=   linesize + 1;
2360     src_cb -= uvlinesize + 1;
2361     src_cr -= uvlinesize + 1;
2362
2363 #define XCHG(a,b,t,xchg)\
2364 t= a;\
2365 if(xchg)\
2366     a= b;\
2367 b= t;
2368
2369     if(deblock_left){
2370         for(i = !deblock_top; i<16; i++){
2371             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2372         }
2373         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2374     }
2375
2376     if(deblock_top){
2377         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2378         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2379         if(s->mb_x+1 < s->mb_width){
2380             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2381         }
2382     }
2383
2384     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2385         if(deblock_left){
2386             for(i = !deblock_top; i<8; i++){
2387                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2388                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2389             }
2390             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2391             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2392         }
2393         if(deblock_top){
2394             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2395             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2396         }
2397     }
2398 }
2399
2400 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2401     MpegEncContext * const s = &h->s;
2402     const int mb_x= s->mb_x;
2403     const int mb_y= s->mb_y;
2404     const int mb_xy= h->mb_xy;
2405     const int mb_type= s->current_picture.mb_type[mb_xy];
2406     uint8_t  *dest_y, *dest_cb, *dest_cr;
2407     int linesize, uvlinesize /*dct_offset*/;
2408     int i;
2409     int *block_offset = &h->block_offset[0];
2410     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2411     /* is_h264 should always be true if SVQ3 is disabled. */
2412     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2413     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2414     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2415
2416     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2417     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2418     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2419
2420     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2421     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2422
2423     if (!simple && MB_FIELD) {
2424         linesize   = h->mb_linesize   = s->linesize * 2;
2425         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2426         block_offset = &h->block_offset[24];
2427         if(mb_y&1){ //FIXME move out of this function?
2428             dest_y -= s->linesize*15;
2429             dest_cb-= s->uvlinesize*7;
2430             dest_cr-= s->uvlinesize*7;
2431         }
2432         if(FRAME_MBAFF) {
2433             int list;
2434             for(list=0; list<h->list_count; list++){
2435                 if(!USES_LIST(mb_type, list))
2436                     continue;
2437                 if(IS_16X16(mb_type)){
2438                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2439                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2440                 }else{
2441                     for(i=0; i<16; i+=4){
2442                         int ref = h->ref_cache[list][scan8[i]];
2443                         if(ref >= 0)
2444                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2445                     }
2446                 }
2447             }
2448         }
2449     } else {
2450         linesize   = h->mb_linesize   = s->linesize;
2451         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2452 //        dct_offset = s->linesize * 16;
2453     }
2454
2455     if (!simple && IS_INTRA_PCM(mb_type)) {
2456         for (i=0; i<16; i++) {
2457             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2458         }
2459         for (i=0; i<8; i++) {
2460             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2461             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2462         }
2463     } else {
2464         if(IS_INTRA(mb_type)){
2465             if(h->deblocking_filter)
2466                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2467
2468             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2469                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2470                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2471             }
2472
2473             if(IS_INTRA4x4(mb_type)){
2474                 if(simple || !s->encoding){
2475                     if(IS_8x8DCT(mb_type)){
2476                         if(transform_bypass){
2477                             idct_dc_add =
2478                             idct_add    = s->dsp.add_pixels8;
2479                         }else{
2480                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2481                             idct_add    = s->dsp.h264_idct8_add;
2482                         }
2483                         for(i=0; i<16; i+=4){
2484                             uint8_t * const ptr= dest_y + block_offset[i];
2485                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2486                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2487                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2488                             }else{
2489                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2490                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2491                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2492                                 if(nnz){
2493                                     if(nnz == 1 && h->mb[i*16])
2494                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2495                                     else
2496                                         idct_add   (ptr, h->mb + i*16, linesize);
2497                                 }
2498                             }
2499                         }
2500                     }else{
2501                         if(transform_bypass){
2502                             idct_dc_add =
2503                             idct_add    = s->dsp.add_pixels4;
2504                         }else{
2505                             idct_dc_add = s->dsp.h264_idct_dc_add;
2506                             idct_add    = s->dsp.h264_idct_add;
2507                         }
2508                         for(i=0; i<16; i++){
2509                             uint8_t * const ptr= dest_y + block_offset[i];
2510                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2511
2512                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2513                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2514                             }else{
2515                                 uint8_t *topright;
2516                                 int nnz, tr;
2517                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2518                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2519                                     assert(mb_y || linesize <= block_offset[i]);
2520                                     if(!topright_avail){
2521                                         tr= ptr[3 - linesize]*0x01010101;
2522                                         topright= (uint8_t*) &tr;
2523                                     }else
2524                                         topright= ptr + 4 - linesize;
2525                                 }else
2526                                     topright= NULL;
2527
2528                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2529                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2530                                 if(nnz){
2531                                     if(is_h264){
2532                                         if(nnz == 1 && h->mb[i*16])
2533                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2534                                         else
2535                                             idct_add   (ptr, h->mb + i*16, linesize);
2536                                     }else
2537                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2538                                 }
2539                             }
2540                         }
2541                     }
2542                 }
2543             }else{
2544                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2545                 if(is_h264){
2546                     if(!transform_bypass)
2547                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2548                 }else
2549                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2550             }
2551             if(h->deblocking_filter)
2552                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2553         }else if(is_h264){
2554             hl_motion(h, dest_y, dest_cb, dest_cr,
2555                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2556                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2557                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2558         }
2559
2560
2561         if(!IS_INTRA4x4(mb_type)){
2562             if(is_h264){
2563                 if(IS_INTRA16x16(mb_type)){
2564                     if(transform_bypass){
2565                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2566                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2567                         }else{
2568                             for(i=0; i<16; i++){
2569                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2570                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2571                             }
2572                         }
2573                     }else{
2574                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2575                     }
2576                 }else if(h->cbp&15){
2577                     if(transform_bypass){
2578                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2579                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2580                         for(i=0; i<16; i+=di){
2581                             if(h->non_zero_count_cache[ scan8[i] ]){
2582                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2583                             }
2584                         }
2585                     }else{
2586                         if(IS_8x8DCT(mb_type)){
2587                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2588                         }else{
2589                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2590                         }
2591                     }
2592                 }
2593             }else{
2594                 for(i=0; i<16; i++){
2595                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2596                         uint8_t * const ptr= dest_y + block_offset[i];
2597                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2598                     }
2599                 }
2600             }
2601         }
2602
2603         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2604             uint8_t *dest[2] = {dest_cb, dest_cr};
2605             if(transform_bypass){
2606                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2607                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2608                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2609                 }else{
2610                     idct_add = s->dsp.add_pixels4;
2611                     for(i=16; i<16+8; i++){
2612                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2613                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2614                     }
2615                 }
2616             }else{
2617                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2618                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2619                 if(is_h264){
2620                     idct_add = s->dsp.h264_idct_add;
2621                     idct_dc_add = s->dsp.h264_idct_dc_add;
2622                     for(i=16; i<16+8; i++){
2623                         if(h->non_zero_count_cache[ scan8[i] ])
2624                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2625                         else if(h->mb[i*16])
2626                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2627                     }
2628                 }else{
2629                     for(i=16; i<16+8; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2631                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2632                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2633                         }
2634                     }
2635                 }
2636             }
2637         }
2638     }
2639     if(h->cbp || IS_INTRA(mb_type))
2640         s->dsp.clear_blocks(h->mb);
2641
2642     if(h->deblocking_filter) {
2643         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2644         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2645         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2646         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2647         if (!simple && FRAME_MBAFF) {
2648             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2649         } else {
2650             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2651         }
2652     }
2653 }
2654
2655 /**
2656  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2657  */
2658 static void hl_decode_mb_simple(H264Context *h){
2659     hl_decode_mb_internal(h, 1);
2660 }
2661
2662 /**
2663  * Process a macroblock; this handles edge cases, such as interlacing.
2664  */
2665 static void av_noinline hl_decode_mb_complex(H264Context *h){
2666     hl_decode_mb_internal(h, 0);
2667 }
2668
2669 static void hl_decode_mb(H264Context *h){
2670     MpegEncContext * const s = &h->s;
2671     const int mb_xy= h->mb_xy;
2672     const int mb_type= s->current_picture.mb_type[mb_xy];
2673     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2674
2675     if (is_complex)
2676         hl_decode_mb_complex(h);
2677     else hl_decode_mb_simple(h);
2678 }
2679
2680 static void pic_as_field(Picture *pic, const int parity){
2681     int i;
2682     for (i = 0; i < 4; ++i) {
2683         if (parity == PICT_BOTTOM_FIELD)
2684             pic->data[i] += pic->linesize[i];
2685         pic->reference = parity;
2686         pic->linesize[i] *= 2;
2687     }
2688     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2689 }
2690
2691 static int split_field_copy(Picture *dest, Picture *src,
2692                             int parity, int id_add){
2693     int match = !!(src->reference & parity);
2694
2695     if (match) {
2696         *dest = *src;
2697         if(parity != PICT_FRAME){
2698             pic_as_field(dest, parity);
2699             dest->pic_id *= 2;
2700             dest->pic_id += id_add;
2701         }
2702     }
2703
2704     return match;
2705 }
2706
2707 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2708     int i[2]={0};
2709     int index=0;
2710
2711     while(i[0]<len || i[1]<len){
2712         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2713             i[0]++;
2714         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2715             i[1]++;
2716         if(i[0] < len){
2717             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2718             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2719         }
2720         if(i[1] < len){
2721             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2722             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2723         }
2724     }
2725
2726     return index;
2727 }
2728
2729 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2730     int i, best_poc;
2731     int out_i= 0;
2732
2733     for(;;){
2734         best_poc= dir ? INT_MIN : INT_MAX;
2735
2736         for(i=0; i<len; i++){
2737             const int poc= src[i]->poc;
2738             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2739                 best_poc= poc;
2740                 sorted[out_i]= src[i];
2741             }
2742         }
2743         if(best_poc == (dir ? INT_MIN : INT_MAX))
2744             break;
2745         limit= sorted[out_i++]->poc - dir;
2746     }
2747     return out_i;
2748 }
2749
2750 /**
2751  * fills the default_ref_list.
2752  */
2753 static int fill_default_ref_list(H264Context *h){
2754     MpegEncContext * const s = &h->s;
2755     int i, len;
2756
2757     if(h->slice_type_nos==FF_B_TYPE){
2758         Picture *sorted[32];
2759         int cur_poc, list;
2760         int lens[2];
2761
2762         if(FIELD_PICTURE)
2763             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2764         else
2765             cur_poc= s->current_picture_ptr->poc;
2766
2767         for(list= 0; list<2; list++){
2768             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2769             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2770             assert(len<=32);
2771             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2772             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2773             assert(len<=32);
2774
2775             if(len < h->ref_count[list])
2776                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2777             lens[list]= len;
2778         }
2779
2780         if(lens[0] == lens[1] && lens[1] > 1){
2781             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2782             if(i == lens[0])
2783                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2784         }
2785     }else{
2786         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2787         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2788         assert(len <= 32);
2789         if(len < h->ref_count[0])
2790             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2791     }
2792 #ifdef TRACE
2793     for (i=0; i<h->ref_count[0]; i++) {
2794         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2795     }
2796     if(h->slice_type_nos==FF_B_TYPE){
2797         for (i=0; i<h->ref_count[1]; i++) {
2798             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2799         }
2800     }
2801 #endif
2802     return 0;
2803 }
2804
2805 static void print_short_term(H264Context *h);
2806 static void print_long_term(H264Context *h);
2807
2808 /**
2809  * Extract structure information about the picture described by pic_num in
2810  * the current decoding context (frame or field). Note that pic_num is
2811  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2812  * @param pic_num picture number for which to extract structure information
2813  * @param structure one of PICT_XXX describing structure of picture
2814  *                      with pic_num
2815  * @return frame number (short term) or long term index of picture
2816  *         described by pic_num
2817  */
2818 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2819     MpegEncContext * const s = &h->s;
2820
2821     *structure = s->picture_structure;
2822     if(FIELD_PICTURE){
2823         if (!(pic_num & 1))
2824             /* opposite field */
2825             *structure ^= PICT_FRAME;
2826         pic_num >>= 1;
2827     }
2828
2829     return pic_num;
2830 }
2831
2832 static int decode_ref_pic_list_reordering(H264Context *h){
2833     MpegEncContext * const s = &h->s;
2834     int list, index, pic_structure;
2835
2836     print_short_term(h);
2837     print_long_term(h);
2838
2839     for(list=0; list<h->list_count; list++){
2840         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2841
2842         if(get_bits1(&s->gb)){
2843             int pred= h->curr_pic_num;
2844
2845             for(index=0; ; index++){
2846                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2847                 unsigned int pic_id;
2848                 int i;
2849                 Picture *ref = NULL;
2850
2851                 if(reordering_of_pic_nums_idc==3)
2852                     break;
2853
2854                 if(index >= h->ref_count[list]){
2855                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2856                     return -1;
2857                 }
2858
2859                 if(reordering_of_pic_nums_idc<3){
2860                     if(reordering_of_pic_nums_idc<2){
2861                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2862                         int frame_num;
2863
2864                         if(abs_diff_pic_num > h->max_pic_num){
2865                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2866                             return -1;
2867                         }
2868
2869                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2870                         else                                pred+= abs_diff_pic_num;
2871                         pred &= h->max_pic_num - 1;
2872
2873                         frame_num = pic_num_extract(h, pred, &pic_structure);
2874
2875                         for(i= h->short_ref_count-1; i>=0; i--){
2876                             ref = h->short_ref[i];
2877                             assert(ref->reference);
2878                             assert(!ref->long_ref);
2879                             if(
2880                                    ref->frame_num == frame_num &&
2881                                    (ref->reference & pic_structure)
2882                               )
2883                                 break;
2884                         }
2885                         if(i>=0)
2886                             ref->pic_id= pred;
2887                     }else{
2888                         int long_idx;
2889                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2890
2891                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2892
2893                         if(long_idx>31){
2894                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2895                             return -1;
2896                         }
2897                         ref = h->long_ref[long_idx];
2898                         assert(!(ref && !ref->reference));
2899                         if(ref && (ref->reference & pic_structure)){
2900                             ref->pic_id= pic_id;
2901                             assert(ref->long_ref);
2902                             i=0;
2903                         }else{
2904                             i=-1;
2905                         }
2906                     }
2907
2908                     if (i < 0) {
2909                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2910                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2911                     } else {
2912                         for(i=index; i+1<h->ref_count[list]; i++){
2913                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2914                                 break;
2915                         }
2916                         for(; i > index; i--){
2917                             h->ref_list[list][i]= h->ref_list[list][i-1];
2918                         }
2919                         h->ref_list[list][index]= *ref;
2920                         if (FIELD_PICTURE){
2921                             pic_as_field(&h->ref_list[list][index], pic_structure);
2922                         }
2923                     }
2924                 }else{
2925                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2926                     return -1;
2927                 }
2928             }
2929         }
2930     }
2931     for(list=0; list<h->list_count; list++){
2932         for(index= 0; index < h->ref_count[list]; index++){
2933             if(!h->ref_list[list][index].data[0]){
2934                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2935                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2936             }
2937         }
2938     }
2939
2940     return 0;
2941 }
2942
2943 static void fill_mbaff_ref_list(H264Context *h){
2944     int list, i, j;
2945     for(list=0; list<2; list++){ //FIXME try list_count
2946         for(i=0; i<h->ref_count[list]; i++){
2947             Picture *frame = &h->ref_list[list][i];
2948             Picture *field = &h->ref_list[list][16+2*i];
2949             field[0] = *frame;
2950             for(j=0; j<3; j++)
2951                 field[0].linesize[j] <<= 1;
2952             field[0].reference = PICT_TOP_FIELD;
2953             field[0].poc= field[0].field_poc[0];
2954             field[1] = field[0];
2955             for(j=0; j<3; j++)
2956                 field[1].data[j] += frame->linesize[j];
2957             field[1].reference = PICT_BOTTOM_FIELD;
2958             field[1].poc= field[1].field_poc[1];
2959
2960             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2961             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2962             for(j=0; j<2; j++){
2963                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2964                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2965             }
2966         }
2967     }
2968     for(j=0; j<h->ref_count[1]; j++){
2969         for(i=0; i<h->ref_count[0]; i++)
2970             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2971         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2972         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2973     }
2974 }
2975
2976 static int pred_weight_table(H264Context *h){
2977     MpegEncContext * const s = &h->s;
2978     int list, i;
2979     int luma_def, chroma_def;
2980
2981     h->use_weight= 0;
2982     h->use_weight_chroma= 0;
2983     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2984     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2985     luma_def = 1<<h->luma_log2_weight_denom;
2986     chroma_def = 1<<h->chroma_log2_weight_denom;
2987
2988     for(list=0; list<2; list++){
2989         h->luma_weight_flag[list]   = 0;
2990         h->chroma_weight_flag[list] = 0;
2991         for(i=0; i<h->ref_count[list]; i++){
2992             int luma_weight_flag, chroma_weight_flag;
2993
2994             luma_weight_flag= get_bits1(&s->gb);
2995             if(luma_weight_flag){
2996                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2997                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2998                 if(   h->luma_weight[list][i] != luma_def
2999                    || h->luma_offset[list][i] != 0) {
3000                     h->use_weight= 1;
3001                     h->luma_weight_flag[list]= 1;
3002                 }
3003             }else{
3004                 h->luma_weight[list][i]= luma_def;
3005                 h->luma_offset[list][i]= 0;
3006             }
3007
3008             if(CHROMA){
3009                 chroma_weight_flag= get_bits1(&s->gb);
3010                 if(chroma_weight_flag){
3011                     int j;
3012                     for(j=0; j<2; j++){
3013                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3014                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3015                         if(   h->chroma_weight[list][i][j] != chroma_def
3016                            || h->chroma_offset[list][i][j] != 0) {
3017                             h->use_weight_chroma= 1;
3018                             h->chroma_weight_flag[list]= 1;
3019                         }
3020                     }
3021                 }else{
3022                     int j;
3023                     for(j=0; j<2; j++){
3024                         h->chroma_weight[list][i][j]= chroma_def;
3025                         h->chroma_offset[list][i][j]= 0;
3026                     }
3027                 }
3028             }
3029         }
3030         if(h->slice_type_nos != FF_B_TYPE) break;
3031     }
3032     h->use_weight= h->use_weight || h->use_weight_chroma;
3033     return 0;
3034 }
3035
3036 static void implicit_weight_table(H264Context *h){
3037     MpegEncContext * const s = &h->s;
3038     int ref0, ref1, i;
3039     int cur_poc = s->current_picture_ptr->poc;
3040
3041     for (i = 0; i < 2; i++) {
3042         h->luma_weight_flag[i]   = 0;
3043         h->chroma_weight_flag[i] = 0;
3044     }
3045
3046     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3047        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3048         h->use_weight= 0;
3049         h->use_weight_chroma= 0;
3050         return;
3051     }
3052
3053     h->use_weight= 2;
3054     h->use_weight_chroma= 2;
3055     h->luma_log2_weight_denom= 5;
3056     h->chroma_log2_weight_denom= 5;
3057
3058     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3059         int poc0 = h->ref_list[0][ref0].poc;
3060         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3061             int poc1 = h->ref_list[1][ref1].poc;
3062             int td = av_clip(poc1 - poc0, -128, 127);
3063             if(td){
3064                 int tb = av_clip(cur_poc - poc0, -128, 127);
3065                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3066                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3067                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3068                     h->implicit_weight[ref0][ref1] = 32;
3069                 else
3070                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3071             }else
3072                 h->implicit_weight[ref0][ref1] = 32;
3073         }
3074     }
3075 }
3076
3077 /**
3078  * Mark a picture as no longer needed for reference. The refmask
3079  * argument allows unreferencing of individual fields or the whole frame.
3080  * If the picture becomes entirely unreferenced, but is being held for
3081  * display purposes, it is marked as such.
3082  * @param refmask mask of fields to unreference; the mask is bitwise
3083  *                anded with the reference marking of pic
3084  * @return non-zero if pic becomes entirely unreferenced (except possibly
3085  *         for display purposes) zero if one of the fields remains in
3086  *         reference
3087  */
3088 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3089     int i;
3090     if (pic->reference &= refmask) {
3091         return 0;
3092     } else {
3093         for(i = 0; h->delayed_pic[i]; i++)
3094             if(pic == h->delayed_pic[i]){
3095                 pic->reference=DELAYED_PIC_REF;
3096                 break;
3097             }
3098         return 1;
3099     }
3100 }
3101
3102 /**
3103  * instantaneous decoder refresh.
3104  */
3105 static void idr(H264Context *h){
3106     int i;
3107
3108     for(i=0; i<16; i++){
3109         remove_long(h, i, 0);
3110     }
3111     assert(h->long_ref_count==0);
3112
3113     for(i=0; i<h->short_ref_count; i++){
3114         unreference_pic(h, h->short_ref[i], 0);
3115         h->short_ref[i]= NULL;
3116     }
3117     h->short_ref_count=0;
3118     h->prev_frame_num= 0;
3119     h->prev_frame_num_offset= 0;
3120     h->prev_poc_msb=
3121     h->prev_poc_lsb= 0;
3122 }
3123
3124 /* forget old pics after a seek */
3125 static void flush_dpb(AVCodecContext *avctx){
3126     H264Context *h= avctx->priv_data;
3127     int i;
3128     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3129         if(h->delayed_pic[i])
3130             h->delayed_pic[i]->reference= 0;
3131         h->delayed_pic[i]= NULL;
3132     }
3133     h->outputed_poc= INT_MIN;
3134     idr(h);
3135     if(h->s.current_picture_ptr)
3136         h->s.current_picture_ptr->reference= 0;
3137     h->s.first_field= 0;
3138     h->sei_recovery_frame_cnt = -1;
3139     h->sei_dpb_output_delay = 0;
3140     h->sei_cpb_removal_delay = -1;
3141     h->sei_buffering_period_present = 0;
3142     ff_mpeg_flush(avctx);
3143 }
3144
3145 /**
3146  * Find a Picture in the short term reference list by frame number.
3147  * @param frame_num frame number to search for
3148  * @param idx the index into h->short_ref where returned picture is found
3149  *            undefined if no picture found.
3150  * @return pointer to the found picture, or NULL if no pic with the provided
3151  *                 frame number is found
3152  */
3153 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3154     MpegEncContext * const s = &h->s;
3155     int i;
3156
3157     for(i=0; i<h->short_ref_count; i++){
3158         Picture *pic= h->short_ref[i];
3159         if(s->avctx->debug&FF_DEBUG_MMCO)
3160             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3161         if(pic->frame_num == frame_num) {
3162             *idx = i;
3163             return pic;
3164         }
3165     }
3166     return NULL;
3167 }
3168
3169 /**
3170  * Remove a picture from the short term reference list by its index in
3171  * that list.  This does no checking on the provided index; it is assumed
3172  * to be valid. Other list entries are shifted down.
3173  * @param i index into h->short_ref of picture to remove.
3174  */
3175 static void remove_short_at_index(H264Context *h, int i){
3176     assert(i >= 0 && i < h->short_ref_count);
3177     h->short_ref[i]= NULL;
3178     if (--h->short_ref_count)
3179         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3180 }
3181
3182 /**
3183  *
3184  * @return the removed picture or NULL if an error occurs
3185  */
3186 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3187     MpegEncContext * const s = &h->s;
3188     Picture *pic;
3189     int i;
3190
3191     if(s->avctx->debug&FF_DEBUG_MMCO)
3192         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3193
3194     pic = find_short(h, frame_num, &i);
3195     if (pic){
3196         if(unreference_pic(h, pic, ref_mask))
3197         remove_short_at_index(h, i);
3198     }
3199
3200     return pic;
3201 }
3202
3203 /**
3204  * Remove a picture from the long term reference list by its index in
3205  * that list.
3206  * @return the removed picture or NULL if an error occurs
3207  */
3208 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3209     Picture *pic;
3210
3211     pic= h->long_ref[i];
3212     if (pic){
3213         if(unreference_pic(h, pic, ref_mask)){
3214             assert(h->long_ref[i]->long_ref == 1);
3215             h->long_ref[i]->long_ref= 0;
3216             h->long_ref[i]= NULL;
3217             h->long_ref_count--;
3218         }
3219     }
3220
3221     return pic;
3222 }
3223
3224 /**
3225  * print short term list
3226  */
3227 static void print_short_term(H264Context *h) {
3228     uint32_t i;
3229     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3230         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3231         for(i=0; i<h->short_ref_count; i++){
3232             Picture *pic= h->short_ref[i];
3233             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3234         }
3235     }
3236 }
3237
3238 /**
3239  * print long term list
3240  */
3241 static void print_long_term(H264Context *h) {
3242     uint32_t i;
3243     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3244         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3245         for(i = 0; i < 16; i++){
3246             Picture *pic= h->long_ref[i];
3247             if (pic) {
3248                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3249             }
3250         }
3251     }
3252 }
3253
3254 /**
3255  * Executes the reference picture marking (memory management control operations).
3256  */
3257 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3258     MpegEncContext * const s = &h->s;
3259     int i, j;
3260     int current_ref_assigned=0;
3261     Picture *av_uninit(pic);
3262
3263     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3264         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3265
3266     for(i=0; i<mmco_count; i++){
3267         int structure, av_uninit(frame_num);
3268         if(s->avctx->debug&FF_DEBUG_MMCO)
3269             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3270
3271         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3272            || mmco[i].opcode == MMCO_SHORT2LONG){
3273             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3274             pic = find_short(h, frame_num, &j);
3275             if(!pic){
3276                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3277                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3278                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3279                 continue;
3280             }
3281         }
3282
3283         switch(mmco[i].opcode){
3284         case MMCO_SHORT2UNUSED:
3285             if(s->avctx->debug&FF_DEBUG_MMCO)
3286                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3287             remove_short(h, frame_num, structure ^ PICT_FRAME);
3288             break;
3289         case MMCO_SHORT2LONG:
3290                 if (h->long_ref[mmco[i].long_arg] != pic)
3291                     remove_long(h, mmco[i].long_arg, 0);
3292
3293                 remove_short_at_index(h, j);
3294                 h->long_ref[ mmco[i].long_arg ]= pic;
3295                 if (h->long_ref[ mmco[i].long_arg ]){
3296                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3297                     h->long_ref_count++;
3298                 }
3299             break;
3300         case MMCO_LONG2UNUSED:
3301             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3302             pic = h->long_ref[j];
3303             if (pic) {
3304                 remove_long(h, j, structure ^ PICT_FRAME);
3305             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3306                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3307             break;
3308         case MMCO_LONG:
3309                     // Comment below left from previous code as it is an interresting note.
3310                     /* First field in pair is in short term list or
3311                      * at a different long term index.
3312                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3313                      * Report the problem and keep the pair where it is,
3314                      * and mark this field valid.
3315                      */
3316
3317             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3318                 remove_long(h, mmco[i].long_arg, 0);
3319
3320                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3321                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3322                 h->long_ref_count++;
3323             }
3324
3325             s->current_picture_ptr->reference |= s->picture_structure;
3326             current_ref_assigned=1;
3327             break;
3328         case MMCO_SET_MAX_LONG:
3329             assert(mmco[i].long_arg <= 16);
3330             // just remove the long term which index is greater than new max
3331             for(j = mmco[i].long_arg; j<16; j++){
3332                 remove_long(h, j, 0);
3333             }
3334             break;
3335         case MMCO_RESET:
3336             while(h->short_ref_count){
3337                 remove_short(h, h->short_ref[0]->frame_num, 0);
3338             }
3339             for(j = 0; j < 16; j++) {
3340                 remove_long(h, j, 0);
3341             }
3342             s->current_picture_ptr->poc=
3343             s->current_picture_ptr->field_poc[0]=
3344             s->current_picture_ptr->field_poc[1]=
3345             h->poc_lsb=
3346             h->poc_msb=
3347             h->frame_num=
3348             s->current_picture_ptr->frame_num= 0;
3349             break;
3350         default: assert(0);
3351         }
3352     }
3353
3354     if (!current_ref_assigned) {
3355         /* Second field of complementary field pair; the first field of
3356          * which is already referenced. If short referenced, it
3357          * should be first entry in short_ref. If not, it must exist
3358          * in long_ref; trying to put it on the short list here is an
3359          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3360          */
3361         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3362             /* Just mark the second field valid */
3363             s->current_picture_ptr->reference = PICT_FRAME;
3364         } else if (s->current_picture_ptr->long_ref) {
3365             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3366                                              "assignment for second field "
3367                                              "in complementary field pair "
3368                                              "(first field is long term)\n");
3369         } else {
3370             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3371             if(pic){
3372                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3373             }
3374
3375             if(h->short_ref_count)
3376                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3377
3378             h->short_ref[0]= s->current_picture_ptr;
3379             h->short_ref_count++;
3380             s->current_picture_ptr->reference |= s->picture_structure;
3381         }
3382     }
3383
3384     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3385
3386         /* We have too many reference frames, probably due to corrupted
3387          * stream. Need to discard one frame. Prevents overrun of the
3388          * short_ref and long_ref buffers.
3389          */
3390         av_log(h->s.avctx, AV_LOG_ERROR,
3391                "number of reference frames exceeds max (probably "
3392                "corrupt input), discarding one\n");
3393
3394         if (h->long_ref_count && !h->short_ref_count) {
3395             for (i = 0; i < 16; ++i)
3396                 if (h->long_ref[i])
3397                     break;
3398
3399             assert(i < 16);
3400             remove_long(h, i, 0);
3401         } else {
3402             pic = h->short_ref[h->short_ref_count - 1];
3403             remove_short(h, pic->frame_num, 0);
3404         }
3405     }
3406
3407     print_short_term(h);
3408     print_long_term(h);
3409     return 0;
3410 }
3411
3412 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3413     MpegEncContext * const s = &h->s;
3414     int i;
3415
3416     h->mmco_index= 0;
3417     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3418         s->broken_link= get_bits1(gb) -1;
3419         if(get_bits1(gb)){
3420             h->mmco[0].opcode= MMCO_LONG;
3421             h->mmco[0].long_arg= 0;
3422             h->mmco_index= 1;
3423         }
3424     }else{
3425         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3426             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3427                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3428
3429                 h->mmco[i].opcode= opcode;
3430                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3431                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3432 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3433                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3434                         return -1;
3435                     }*/
3436                 }
3437                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3438                     unsigned int long_arg= get_ue_golomb_31(gb);
3439                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3440                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3441                         return -1;
3442                     }
3443                     h->mmco[i].long_arg= long_arg;
3444                 }
3445
3446                 if(opcode > (unsigned)MMCO_LONG){
3447                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3448                     return -1;
3449                 }
3450                 if(opcode == MMCO_END)
3451                     break;
3452             }
3453             h->mmco_index= i;
3454         }else{
3455             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3456
3457             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3458                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3459                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3460                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3461                 h->mmco_index= 1;
3462                 if (FIELD_PICTURE) {
3463                     h->mmco[0].short_pic_num *= 2;
3464                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3465                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3466                     h->mmco_index= 2;
3467                 }
3468             }
3469         }
3470     }
3471
3472     return 0;
3473 }
3474
3475 static int init_poc(H264Context *h){
3476     MpegEncContext * const s = &h->s;
3477     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3478     int field_poc[2];
3479     Picture *cur = s->current_picture_ptr;
3480
3481     h->frame_num_offset= h->prev_frame_num_offset;
3482     if(h->frame_num < h->prev_frame_num)
3483         h->frame_num_offset += max_frame_num;
3484
3485     if(h->sps.poc_type==0){
3486         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3487
3488         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3489             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3490         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3491             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3492         else
3493             h->poc_msb = h->prev_poc_msb;
3494 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3495         field_poc[0] =
3496         field_poc[1] = h->poc_msb + h->poc_lsb;
3497         if(s->picture_structure == PICT_FRAME)
3498             field_poc[1] += h->delta_poc_bottom;
3499     }else if(h->sps.poc_type==1){
3500         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3501         int i;
3502
3503         if(h->sps.poc_cycle_length != 0)
3504             abs_frame_num = h->frame_num_offset + h->frame_num;
3505         else
3506             abs_frame_num = 0;
3507
3508         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3509             abs_frame_num--;
3510
3511         expected_delta_per_poc_cycle = 0;
3512         for(i=0; i < h->sps.poc_cycle_length; i++)
3513             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3514
3515         if(abs_frame_num > 0){
3516             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3517             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3518
3519             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3520             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3521                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3522         } else
3523             expectedpoc = 0;
3524
3525         if(h->nal_ref_idc == 0)
3526             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3527
3528         field_poc[0] = expectedpoc + h->delta_poc[0];
3529         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3530
3531         if(s->picture_structure == PICT_FRAME)
3532             field_poc[1] += h->delta_poc[1];
3533     }else{
3534         int poc= 2*(h->frame_num_offset + h->frame_num);
3535
3536         if(!h->nal_ref_idc)
3537             poc--;
3538
3539         field_poc[0]= poc;
3540         field_poc[1]= poc;
3541     }
3542
3543     if(s->picture_structure != PICT_BOTTOM_FIELD)
3544         s->current_picture_ptr->field_poc[0]= field_poc[0];
3545     if(s->picture_structure != PICT_TOP_FIELD)
3546         s->current_picture_ptr->field_poc[1]= field_poc[1];
3547     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3548
3549     return 0;
3550 }
3551
3552
3553 /**
3554  * initialize scan tables
3555  */
3556 static void init_scan_tables(H264Context *h){
3557     MpegEncContext * const s = &h->s;
3558     int i;
3559     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3560         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3561         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3562     }else{
3563         for(i=0; i<16; i++){
3564 #define T(x) (x>>2) | ((x<<2) & 0xF)
3565             h->zigzag_scan[i] = T(zigzag_scan[i]);
3566             h-> field_scan[i] = T( field_scan[i]);
3567 #undef T
3568         }
3569     }
3570     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3571         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3572         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3573         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3574         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3575     }else{
3576         for(i=0; i<64; i++){
3577 #define T(x) (x>>3) | ((x&7)<<3)
3578             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3579             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3580             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3581             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3582 #undef T
3583         }
3584     }
3585     if(h->sps.transform_bypass){ //FIXME same ugly
3586         h->zigzag_scan_q0          = zigzag_scan;
3587         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3588         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3589         h->field_scan_q0           = field_scan;
3590         h->field_scan8x8_q0        = field_scan8x8;
3591         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3592     }else{
3593         h->zigzag_scan_q0          = h->zigzag_scan;
3594         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3595         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3596         h->field_scan_q0           = h->field_scan;
3597         h->field_scan8x8_q0        = h->field_scan8x8;
3598         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3599     }
3600 }
3601
3602 /**
3603  * Replicates H264 "master" context to thread contexts.
3604  */
3605 static void clone_slice(H264Context *dst, H264Context *src)
3606 {
3607     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3608     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3609     dst->s.current_picture      = src->s.current_picture;
3610     dst->s.linesize             = src->s.linesize;
3611     dst->s.uvlinesize           = src->s.uvlinesize;
3612     dst->s.first_field          = src->s.first_field;
3613
3614     dst->prev_poc_msb           = src->prev_poc_msb;
3615     dst->prev_poc_lsb           = src->prev_poc_lsb;
3616     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3617     dst->prev_frame_num         = src->prev_frame_num;
3618     dst->short_ref_count        = src->short_ref_count;
3619
3620     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3621     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3622     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3623     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3624
3625     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3626     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3627 }
3628
3629 /**
3630  * decodes a slice header.
3631  * This will also call MPV_common_init() and frame_start() as needed.
3632  *
3633  * @param h h264context
3634  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3635  *
3636  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3637  */
3638 static int decode_slice_header(H264Context *h, H264Context *h0){
3639     MpegEncContext * const s = &h->s;
3640     MpegEncContext * const s0 = &h0->s;
3641     unsigned int first_mb_in_slice;
3642     unsigned int pps_id;
3643     int num_ref_idx_active_override_flag;
3644     unsigned int slice_type, tmp, i, j;
3645     int default_ref_list_done = 0;
3646     int last_pic_structure;
3647
3648     s->dropable= h->nal_ref_idc == 0;
3649
3650     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3651         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3652         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3653     }else{
3654         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3655         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3656     }
3657
3658     first_mb_in_slice= get_ue_golomb(&s->gb);
3659
3660     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3661         h0->current_slice = 0;
3662         if (!s0->first_field)
3663             s->current_picture_ptr= NULL;
3664     }
3665
3666     slice_type= get_ue_golomb_31(&s->gb);
3667     if(slice_type > 9){
3668         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3669         return -1;
3670     }
3671     if(slice_type > 4){
3672         slice_type -= 5;
3673         h->slice_type_fixed=1;
3674     }else
3675         h->slice_type_fixed=0;
3676
3677     slice_type= golomb_to_pict_type[ slice_type ];
3678     if (slice_type == FF_I_TYPE
3679         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3680         default_ref_list_done = 1;
3681     }
3682     h->slice_type= slice_type;
3683     h->slice_type_nos= slice_type & 3;
3684
3685     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3686     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3687         av_log(h->s.avctx, AV_LOG_ERROR,
3688                "B picture before any references, skipping\n");
3689         return -1;
3690     }
3691
3692     pps_id= get_ue_golomb(&s->gb);
3693     if(pps_id>=MAX_PPS_COUNT){
3694         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3695         return -1;
3696     }
3697     if(!h0->pps_buffers[pps_id]) {
3698         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3699         return -1;
3700     }
3701     h->pps= *h0->pps_buffers[pps_id];
3702
3703     if(!h0->sps_buffers[h->pps.sps_id]) {
3704         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3705         return -1;
3706     }
3707     h->sps = *h0->sps_buffers[h->pps.sps_id];
3708
3709     if(h == h0 && h->dequant_coeff_pps != pps_id){
3710         h->dequant_coeff_pps = pps_id;
3711         init_dequant_tables(h);
3712     }
3713
3714     s->mb_width= h->sps.mb_width;
3715     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3716
3717     h->b_stride=  s->mb_width*4;
3718     h->b8_stride= s->mb_width*2;
3719
3720     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3721     if(h->sps.frame_mbs_only_flag)
3722         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3723     else
3724         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3725
3726     if (s->context_initialized
3727         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3728         if(h != h0)
3729             return -1;   // width / height changed during parallelized decoding
3730         free_tables(h);
3731         flush_dpb(s->avctx);
3732         MPV_common_end(s);
3733     }
3734     if (!s->context_initialized) {
3735         if(h != h0)
3736             return -1;  // we cant (re-)initialize context during parallel decoding
3737         if (MPV_common_init(s) < 0)
3738             return -1;
3739         s->first_field = 0;
3740
3741         init_scan_tables(h);
3742         alloc_tables(h);
3743
3744         for(i = 1; i < s->avctx->thread_count; i++) {
3745             H264Context *c;
3746             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3747             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3748             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3749             c->sps = h->sps;
3750             c->pps = h->pps;
3751             init_scan_tables(c);
3752             clone_tables(c, h);
3753         }
3754
3755         for(i = 0; i < s->avctx->thread_count; i++)
3756             if(context_init(h->thread_context[i]) < 0)
3757                 return -1;
3758
3759         s->avctx->width = s->width;
3760         s->avctx->height = s->height;
3761         s->avctx->sample_aspect_ratio= h->sps.sar;
3762         if(!s->avctx->sample_aspect_ratio.den)
3763             s->avctx->sample_aspect_ratio.den = 1;
3764
3765         if(h->sps.timing_info_present_flag){
3766             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3767             if(h->x264_build > 0 && h->x264_build < 44)
3768                 s->avctx->time_base.den *= 2;
3769             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3770                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3771         }
3772     }
3773
3774     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3775
3776     h->mb_mbaff = 0;
3777     h->mb_aff_frame = 0;
3778     last_pic_structure = s0->picture_structure;
3779     if(h->sps.frame_mbs_only_flag){
3780         s->picture_structure= PICT_FRAME;
3781     }else{
3782         if(get_bits1(&s->gb)) { //field_pic_flag
3783             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3784         } else {
3785             s->picture_structure= PICT_FRAME;
3786             h->mb_aff_frame = h->sps.mb_aff;
3787         }
3788     }
3789     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3790
3791     if(h0->current_slice == 0){
3792         while(h->frame_num !=  h->prev_frame_num &&
3793               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3794             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3795             if (frame_start(h) < 0)
3796                 return -1;
3797             h->prev_frame_num++;
3798             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3799             s->current_picture_ptr->frame_num= h->prev_frame_num;
3800             execute_ref_pic_marking(h, NULL, 0);
3801         }
3802
3803         /* See if we have a decoded first field looking for a pair... */
3804         if (s0->first_field) {
3805             assert(s0->current_picture_ptr);
3806             assert(s0->current_picture_ptr->data[0]);
3807             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3808
3809             /* figure out if we have a complementary field pair */
3810             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3811                 /*
3812                  * Previous field is unmatched. Don't display it, but let it
3813                  * remain for reference if marked as such.
3814                  */
3815                 s0->current_picture_ptr = NULL;
3816                 s0->first_field = FIELD_PICTURE;
3817
3818             } else {
3819                 if (h->nal_ref_idc &&
3820                         s0->current_picture_ptr->reference &&
3821                         s0->current_picture_ptr->frame_num != h->frame_num) {
3822                     /*
3823                      * This and previous field were reference, but had
3824                      * different frame_nums. Consider this field first in
3825                      * pair. Throw away previous field except for reference
3826                      * purposes.
3827                      */
3828                     s0->first_field = 1;
3829                     s0->current_picture_ptr = NULL;
3830
3831                 } else {
3832                     /* Second field in complementary pair */
3833                     s0->first_field = 0;
3834                 }
3835             }
3836
3837         } else {
3838             /* Frame or first field in a potentially complementary pair */
3839             assert(!s0->current_picture_ptr);
3840             s0->first_field = FIELD_PICTURE;
3841         }
3842
3843         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3844             s0->first_field = 0;
3845             return -1;
3846         }
3847     }
3848     if(h != h0)
3849         clone_slice(h, h0);
3850
3851     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3852
3853     assert(s->mb_num == s->mb_width * s->mb_height);
3854     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3855        first_mb_in_slice                    >= s->mb_num){
3856         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3857         return -1;
3858     }
3859     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3860     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3861     if (s->picture_structure == PICT_BOTTOM_FIELD)
3862         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3863     assert(s->mb_y < s->mb_height);
3864
3865     if(s->picture_structure==PICT_FRAME){
3866         h->curr_pic_num=   h->frame_num;
3867         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3868     }else{
3869         h->curr_pic_num= 2*h->frame_num + 1;
3870         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3871     }
3872
3873     if(h->nal_unit_type == NAL_IDR_SLICE){
3874         get_ue_golomb(&s->gb); /* idr_pic_id */
3875     }
3876
3877     if(h->sps.poc_type==0){
3878         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3879
3880         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3881             h->delta_poc_bottom= get_se_golomb(&s->gb);
3882         }
3883     }
3884
3885     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3886         h->delta_poc[0]= get_se_golomb(&s->gb);
3887
3888         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3889             h->delta_poc[1]= get_se_golomb(&s->gb);
3890     }
3891
3892     init_poc(h);
3893
3894     if(h->pps.redundant_pic_cnt_present){
3895         h->redundant_pic_count= get_ue_golomb(&s->gb);
3896     }
3897
3898     //set defaults, might be overridden a few lines later
3899     h->ref_count[0]= h->pps.ref_count[0];
3900     h->ref_count[1]= h->pps.ref_count[1];
3901
3902     if(h->slice_type_nos != FF_I_TYPE){
3903         if(h->slice_type_nos == FF_B_TYPE){
3904             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3905         }
3906         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3907
3908         if(num_ref_idx_active_override_flag){
3909             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3910             if(h->slice_type_nos==FF_B_TYPE)
3911                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3912
3913             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3914                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3915                 h->ref_count[0]= h->ref_count[1]= 1;
3916                 return -1;
3917             }
3918         }
3919         if(h->slice_type_nos == FF_B_TYPE)
3920             h->list_count= 2;
3921         else
3922             h->list_count= 1;
3923     }else
3924         h->list_count= 0;
3925
3926     if(!default_ref_list_done){
3927         fill_default_ref_list(h);
3928     }
3929
3930     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3931         return -1;
3932
3933     if(h->slice_type_nos!=FF_I_TYPE){
3934         s->last_picture_ptr= &h->ref_list[0][0];
3935         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3936     }
3937     if(h->slice_type_nos==FF_B_TYPE){
3938         s->next_picture_ptr= &h->ref_list[1][0];
3939         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3940     }
3941
3942     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3943        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3944         pred_weight_table(h);
3945     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3946         implicit_weight_table(h);
3947     else {
3948         h->use_weight = 0;
3949         for (i = 0; i < 2; i++) {
3950             h->luma_weight_flag[i]   = 0;
3951             h->chroma_weight_flag[i] = 0;
3952         }
3953     }
3954
3955     if(h->nal_ref_idc)
3956         decode_ref_pic_marking(h0, &s->gb);
3957
3958     if(FRAME_MBAFF)
3959         fill_mbaff_ref_list(h);
3960
3961     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3962         direct_dist_scale_factor(h);
3963     direct_ref_list_init(h);
3964
3965     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3966         tmp = get_ue_golomb_31(&s->gb);
3967         if(tmp > 2){
3968             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3969             return -1;
3970         }
3971         h->cabac_init_idc= tmp;
3972     }
3973
3974     h->last_qscale_diff = 0;
3975     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3976     if(tmp>51){
3977         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3978         return -1;
3979     }
3980     s->qscale= tmp;
3981     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3982     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3983     //FIXME qscale / qp ... stuff
3984     if(h->slice_type == FF_SP_TYPE){
3985         get_bits1(&s->gb); /* sp_for_switch_flag */
3986     }
3987     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3988         get_se_golomb(&s->gb); /* slice_qs_delta */
3989     }
3990
3991     h->deblocking_filter = 1;
3992     h->slice_alpha_c0_offset = 0;
3993     h->slice_beta_offset = 0;
3994     if( h->pps.deblocking_filter_parameters_present ) {
3995         tmp= get_ue_golomb_31(&s->gb);
3996         if(tmp > 2){
3997             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3998             return -1;
3999         }
4000         h->deblocking_filter= tmp;
4001         if(h->deblocking_filter < 2)
4002             h->deblocking_filter^= 1; // 1<->0
4003
4004         if( h->deblocking_filter ) {
4005             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4006             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4007         }
4008     }
4009
4010     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4011        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4012        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4013        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4014         h->deblocking_filter= 0;
4015
4016     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4017         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4018             /* Cheat slightly for speed:
4019                Do not bother to deblock across slices. */
4020             h->deblocking_filter = 2;
4021         } else {
4022             h0->max_contexts = 1;
4023             if(!h0->single_decode_warning) {
4024                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4025                 h0->single_decode_warning = 1;
4026             }
4027             if(h != h0)
4028                 return 1; // deblocking switched inside frame
4029         }
4030     }
4031
4032 #if 0 //FMO
4033     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4034         slice_group_change_cycle= get_bits(&s->gb, ?);
4035 #endif
4036
4037     h0->last_slice_type = slice_type;
4038     h->slice_num = ++h0->current_slice;
4039     if(h->slice_num >= MAX_SLICES){
4040         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4041     }
4042
4043     for(j=0; j<2; j++){
4044         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4045         ref2frm[0]=
4046         ref2frm[1]= -1;
4047         for(i=0; i<16; i++)
4048             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4049                           +(h->ref_list[j][i].reference&3);
4050         ref2frm[18+0]=
4051         ref2frm[18+1]= -1;
4052         for(i=16; i<48; i++)
4053             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4054                           +(h->ref_list[j][i].reference&3);
4055     }
4056
4057     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4058     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4059
4060     s->avctx->refs= h->sps.ref_frame_count;
4061
4062     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4063         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4064                h->slice_num,
4065                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4066                first_mb_in_slice,
4067                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4068                pps_id, h->frame_num,
4069                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4070                h->ref_count[0], h->ref_count[1],
4071                s->qscale,
4072                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4073                h->use_weight,
4074                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4075                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4076                );
4077     }
4078
4079     return 0;
4080 }
4081
4082 /**
4083  *
4084  */
4085 static inline int get_level_prefix(GetBitContext *gb){
4086     unsigned int buf;
4087     int log;
4088
4089     OPEN_READER(re, gb);
4090     UPDATE_CACHE(re, gb);
4091     buf=GET_CACHE(re, gb);
4092
4093     log= 32 - av_log2(buf);
4094 #ifdef TRACE
4095     print_bin(buf>>(32-log), log);
4096     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4097 #endif
4098
4099     LAST_SKIP_BITS(re, gb, log);
4100     CLOSE_READER(re, gb);
4101
4102     return log-1;
4103 }
4104
4105 static inline int get_dct8x8_allowed(H264Context *h){
4106     if(h->sps.direct_8x8_inference_flag)
4107         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4108     else
4109         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4110 }
4111
/**
 * Decodes a residual block.
 *
 * Reads the coefficient token, the trailing-one signs, the remaining levels,
 * the total number of zeros and the individual zero runs from the bitstream,
 * then stores the levels into block at the positions given by scantable,
 * walking from the highest used scan position downwards.
 *
 * @param h decoder context
 * @param gb bit reader to consume from
 * @param block destination coefficient array, indexed through scantable
 * @param n block index; CHROMA_DC_BLOCK_INDEX and LUMA_DC_BLOCK_INDEX select
 *          dedicated handling, and for n > 24 the levels are stored without
 *          applying qmul
 * @param scantable scantable
 * @param qmul dequantization factors indexed like block; only read when n <= 24
 * @param max_coeff number of coefficients in the block
 * @return 0 on success (including an empty block), <0 if an error occurred
 */
static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
    MpegEncContext * const s = &h->s;
    /* maps the predicted coefficient count (0..16) to one of the four coeff_token VLC tables */
    static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    int level[16];
    int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;

    //FIXME put trailing_ones into the context

    /* coeff_token packs total_coeff in its upper bits and trailing_ones in its low 2 bits */
    if(n == CHROMA_DC_BLOCK_INDEX){
        coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
        total_coeff= coeff_token>>2;
    }else{
        if(n == LUMA_DC_BLOCK_INDEX){
            /* luma DC selects its VLC table from the prediction for block 0 */
            total_coeff= pred_non_zero_count(h, 0);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
        }else{
            total_coeff= pred_non_zero_count(h, n);
            coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
            total_coeff= coeff_token>>2;
            /* remember the count so that it can feed later predictions */
            h->non_zero_count_cache[ scan8[n] ]= total_coeff;
        }
    }

    //FIXME set last_non_zero?

    if(total_coeff==0)
        return 0;
    /* the unsigned comparison also rejects a negative total_coeff */
    if(total_coeff > (unsigned)max_coeff) {
        av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
        return -1;
    }

    trailing_ones= coeff_token&3;
    tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
    assert(total_coeff<=16);

    /* peek 3 sign bits but consume only trailing_ones of them;
     * a set bit yields -1, a clear bit +1 */
    i = show_bits(gb, 3);
    skip_bits(gb, trailing_ones);
    level[0] = 1-((i&4)>>1);
    level[1] = 1-((i&2)   );
    level[2] = 1-((i&1)<<1);

    if(trailing_ones<total_coeff) {
        int mask, prefix;
        int suffix_length = total_coeff > 10 && trailing_ones < 3;
        int bitsi= show_bits(gb, LEVEL_TAB_BITS);
        int level_code= cavlc_level_tab[suffix_length][bitsi][0];

        skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
        /* table values >= 100 are escapes carrying only a prefix (value - 100);
         * anything else is already the decoded level */
        if(level_code >= 100){
            prefix= level_code - 100;
            if(prefix == LEVEL_TAB_BITS)
                prefix += get_level_prefix(gb);

            //first coefficient has suffix_length equal to 0 or 1
            if(prefix<14){ //FIXME try to build a large unified VLC table for all this
                if(suffix_length)
                    level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
                else
                    level_code= (prefix<<suffix_length); //part
            }else if(prefix==14){
                if(suffix_length)
                    level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
                else
                    level_code= prefix + get_bits(gb, 4); //part
            }else{
                level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
                if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
                if(prefix>=16)
                    level_code += (1<<(prefix-3))-4096;
            }

            if(trailing_ones < 3) level_code += 2;

            suffix_length = 2;
            /* even level_code -> positive level, odd level_code -> negative level */
            mask= -(level_code&1);
            level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
        }else{
            /* with fewer than 3 trailing ones, move the level one step away from zero */
            if(trailing_ones < 3) level_code += (level_code>>31)|1;

            suffix_length = 1;
            /* true when level_code lies outside [-3, 3] */
            if(level_code + 3U > 6U)
                suffix_length++;
            level[trailing_ones]= level_code;
        }

        //remaining coefficients have suffix_length > 0
        for(i=trailing_ones+1;i<total_coeff;i++) {
            static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
            int bitsi= show_bits(gb, LEVEL_TAB_BITS);
            level_code= cavlc_level_tab[suffix_length][bitsi][0];

            skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
            if(level_code >= 100){
                prefix= level_code - 100;
                if(prefix == LEVEL_TAB_BITS){
                    prefix += get_level_prefix(gb);
                }
                if(prefix<15){
                    level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
                }else{
                    level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
                    if(prefix>=16)
                        level_code += (1<<(prefix-3))-4096;
                }
                mask= -(level_code&1);
                level_code= (((2+level_code)>>1) ^ mask) - mask;
            }
            level[i]= level_code;

            /* unsigned form of |level_code| > suffix_limit[suffix_length] */
            if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
                suffix_length++;
        }
    }

    /* a completely filled block leaves no room for zeros, so nothing is coded */
    if(total_coeff == max_coeff)
        zeros_left=0;
    else{
        if(n == CHROMA_DC_BLOCK_INDEX)
            zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
        else
            zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
    }

    /* start at the highest used scan position and walk backwards */
    coeff_num = zeros_left + total_coeff - 1;
    j = scantable[coeff_num];
    if(n > 24){
        /* levels are stored as decoded, qmul is not applied */
        block[j] = level[0];
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= level[i];
        }
    }else{
        /* levels are scaled by qmul with rounding: (level*qmul + 32) >> 6 */
        block[j] = (level[0] * qmul[j] + 32)>>6;
        for(i=1;i<total_coeff;i++) {
            if(zeros_left <= 0)
                run_before = 0;
            else if(zeros_left < 7){
                run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
            }else{
                run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
            }
            zeros_left -= run_before;
            coeff_num -= 1 + run_before;
            j= scantable[ coeff_num ];

            block[j]= (level[i] * qmul[j] + 32)>>6;
        }
    }

    /* NOTE(review): this check runs only after the stores above, so a run
     * larger than zeros_left has already moved coeff_num before it is caught */
    if(zeros_left<0){
        av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    return 0;
}
4287
4288 static void predict_field_decoding_flag(H264Context *h){
4289     MpegEncContext * const s = &h->s;
4290     const int mb_xy= h->mb_xy;
4291     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4292                 ? s->current_picture.mb_type[mb_xy-1]
4293                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4294                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4295                 : 0;
4296     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4297 }
4298
4299 /**
4300  * decodes a P_SKIP or B_SKIP macroblock
4301  */
4302 static void decode_mb_skip(H264Context *h){
4303     MpegEncContext * const s = &h->s;
4304     const int mb_xy= h->mb_xy;
4305     int mb_type=0;
4306
4307     memset(h->non_zero_count[mb_xy], 0, 16);
4308     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4309
4310     if(MB_FIELD)
4311         mb_type|= MB_TYPE_INTERLACED;
4312
4313     if( h->slice_type_nos == FF_B_TYPE )
4314     {
4315         // just for fill_caches. pred_direct_motion will set the real mb_type
4316         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4317
4318         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4319         pred_direct_motion(h, &mb_type);
4320         mb_type|= MB_TYPE_SKIP;
4321     }
4322     else
4323     {
4324         int mx, my;
4325         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4326
4327         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4328         pred_pskip_motion(h, &mx, &my);
4329         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4330         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4331     }
4332
4333     write_back_motion(h, mb_type);
4334     s->current_picture.mb_type[mb_xy]= mb_type;
4335     s->current_picture.qscale_table[mb_xy]= s->qscale;
4336     h->slice_table[ mb_xy ]= h->slice_num;
4337     h->prev_mb_skipped= 1;
4338 }
4339
/**
 * Decodes one macroblock of a CAVLC coded slice.
 *
 * Handles the skip run, the macroblock type, intra prediction modes or
 * reference indices and motion vectors, the coded block pattern, the
 * quantizer delta and finally the residual coefficients.
 * The position is taken from s->mb_x / s->mb_y and stored in h->mb_xy.
 *
 * @param h decoder context
 * @return 0 on success, -1 if an error is detected in the bitstream
 */
static int decode_mb_cavlc(H264Context *h){
    MpegEncContext * const s = &h->s;
    int mb_xy;
    int partition_count;
    unsigned int mb_type, cbp;
    int dct8x8_allowed= h->pps.transform_8x8_mode;

    mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;

    tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
    cbp = 0; /* avoid warning. FIXME: find a solution without slowing
                down the code */
    if(h->slice_type_nos != FF_I_TYPE){
        // -1 marks that no skip run is pending, so a new one is read
        if(s->mb_skip_run==-1)
            s->mb_skip_run= get_ue_golomb(&s->gb);

        if (s->mb_skip_run--) {
            if(FRAME_MBAFF && (s->mb_y&1) == 0){
                // the field flag is read from the bitstream only for the last
                // macroblock of the skip run, otherwise it is predicted
                if(s->mb_skip_run==0)
                    h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
                else
                    predict_field_decoding_flag(h);
            }
            decode_mb_skip(h);
            return 0;
        }
    }
    if(FRAME_MBAFF){
        // only macroblocks in even rows read the field decoding flag here
        if( (s->mb_y&1) == 0 )
            h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
    }

    h->prev_mb_skipped= 0;

    // map the coded mb_type onto the internal type; values past the inter
    // range of B (23) and P (5) slices select an intra type
    mb_type= get_ue_golomb(&s->gb);
    if(h->slice_type_nos == FF_B_TYPE){
        if(mb_type < 23){
            partition_count= b_mb_type_info[mb_type].partition_count;
            mb_type=         b_mb_type_info[mb_type].type;
        }else{
            mb_type -= 23;
            goto decode_intra_mb;
        }
    }else if(h->slice_type_nos == FF_P_TYPE){
        if(mb_type < 5){
            partition_count= p_mb_type_info[mb_type].partition_count;
            mb_type=         p_mb_type_info[mb_type].type;
        }else{
            mb_type -= 5;
            goto decode_intra_mb;
        }
    }else{
       assert(h->slice_type_nos == FF_I_TYPE);
        if(h->slice_type == FF_SI_TYPE && mb_type)
            mb_type--;
decode_intra_mb:
        if(mb_type > 25){
            av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
            return -1;
        }
        partition_count=0;
        // for intra types the table also supplies the cbp and the 16x16 prediction mode
        cbp= i_mb_type_info[mb_type].cbp;
        h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
        mb_type= i_mb_type_info[mb_type].type;
    }

    if(MB_FIELD)
        mb_type |= MB_TYPE_INTERLACED;

    h->slice_table[ mb_xy ]= h->slice_num;

    if(IS_INTRA_PCM(mb_type)){
        unsigned int x;

        // We assume these blocks are very rare so we do not optimize it.
        align_get_bits(&s->gb);

        // The pixels are stored in the same order as levels in h->mb array.
        for(x=0; x < (CHROMA ? 384 : 256); x++){
            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
        }

        // In deblocking, the quantizer is 0
        s->current_picture.qscale_table[mb_xy]= 0;
        // All coeffs are present
        memset(h->non_zero_count[mb_xy], 16, 16);

        s->current_picture.mb_type[mb_xy]= mb_type;
        return 0;
    }

    // the reference counts are doubled while an MBAFF macroblock is parsed
    // and halved again just before the final return
    // NOTE(review): the "return -1" error paths below skip that restoring
    // shift - confirm that callers reset h->ref_count after a failure
    if(MB_MBAFF){
        h->ref_count[0] <<= 1;
        h->ref_count[1] <<= 1;
    }

    fill_caches(h, mb_type, 0);

    //mb_pred
    if(IS_INTRA(mb_type)){
        int pred_mode;
//            init_top_left_availability(h);
        if(IS_INTRA4x4(mb_type)){
            int i;
            int di = 1;
            // with the 8x8 transform one prediction mode is coded per 8x8 block
            if(dct8x8_allowed && get_bits1(&s->gb)){
                mb_type |= MB_TYPE_8x8DCT;
                di = 4;
            }

//                fill_intra4x4_pred_table(h);
            for(i=0; i<16; i+=di){
                int mode= pred_intra_mode(h, i);

                // a zero bit means the predicted mode is not used; the 3 bit
                // remainder then selects one of the other modes
                if(!get_bits1(&s->gb)){
                    const int rem_mode= get_bits(&s->gb, 3);
                    mode = rem_mode + (rem_mode >= mode);
                }

                if(di==4)
                    fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
                else
                    h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
            }
            write_back_intra_pred_mode(h);
            if( check_intra4x4_pred_mode(h) < 0)
                return -1;
        }else{
            h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
            if(h->intra16x16_pred_mode < 0)
                return -1;
        }
        if(CHROMA){
            pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
            if(pred_mode < 0)
                return -1;
            h->chroma_pred_mode= pred_mode;
        }
    }else if(partition_count==4){
        int i, j, sub_partition_count[4], list, ref[2][4];

        // read and validate the four sub macroblock types
        if(h->slice_type_nos == FF_B_TYPE){
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
                if(h->sub_mb_type[i] >=13){
                    av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
            if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
               || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
                pred_direct_motion(h, &mb_type);
                h->ref_cache[0][scan8[4]] =
                h->ref_cache[1][scan8[4]] =
                h->ref_cache[0][scan8[12]] =
                h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
            }
        }else{
            assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
            for(i=0; i<4; i++){
                h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
                if(h->sub_mb_type[i] >=4){
                    av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
                    return -1;
                }
                sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
                h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
            }
        }

        // first pass: all reference indices; nothing is coded for a single
        // reference, one inverted bit for two, an ue golomb code otherwise
        for(list=0; list<h->list_count; list++){
            int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) continue;
                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    unsigned int tmp;
                    if(ref_count == 1){
                        tmp= 0;
                    }else if(ref_count == 2){
                        tmp= get_bits1(&s->gb)^1;
                    }else{
                        tmp= get_ue_golomb_31(&s->gb);
                        if(tmp>=ref_count){
                            av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
                            return -1;
                        }
                    }
                    ref[list][i]= tmp;
                }else{
                 //FIXME
                    ref[list][i] = -1;
                }
            }
        }

        if(dct8x8_allowed)
            dct8x8_allowed = get_dct8x8_allowed(h);

        // second pass: motion vector differences, added to the prediction
        for(list=0; list<h->list_count; list++){
            for(i=0; i<4; i++){
                if(IS_DIRECT(h->sub_mb_type[i])) {
                    h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
                    continue;
                }
                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
                h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];

                if(IS_DIR(h->sub_mb_type[i], 0, list)){
                    const int sub_mb_type= h->sub_mb_type[i];
                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
                    for(j=0; j<sub_partition_count[i]; j++){
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        // replicate the vector over the cache entries covered by the sub partition
                        if(IS_SUB_8X8(sub_mb_type)){
                            mv_cache[ 1 ][0]=
                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
                            mv_cache[ 1 ][1]=
                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
                        }else if(IS_SUB_8X4(sub_mb_type)){
                            mv_cache[ 1 ][0]= mx;
                            mv_cache[ 1 ][1]= my;
                        }else if(IS_SUB_4X8(sub_mb_type)){
                            mv_cache[ 8 ][0]= mx;
                            mv_cache[ 8 ][1]= my;
                        }
                        mv_cache[ 0 ][0]= mx;
                        mv_cache[ 0 ][1]= my;
                    }
                }else{
                    // list not used by this sub macroblock: zero its four vectors
                    uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
                    p[0] = p[1]=
                    p[8] = p[9]= 0;
                }
            }
        }
    }else if(IS_DIRECT(mb_type)){
        pred_direct_motion(h, &mb_type);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    }else{
        int list, mx, my, i;
         //FIXME we should set ref_idx_l? to 0 if we use that later ...
        // for every partition shape all reference indices are parsed first,
        // followed by all motion vector differences
        if(IS_16X16(mb_type)){
            for(list=0; list<h->list_count; list++){
                    unsigned int val;
                    if(IS_DIR(mb_type, 0, list)){
                        if(h->ref_count[list]==1){
                            val= 0;
                        }else if(h->ref_count[list]==2){
                            val= get_bits1(&s->gb)^1;
                        }else{
                            val= get_ue_golomb_31(&s->gb);
                            if(val >= h->ref_count[list]){
                                av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                return -1;
                            }
                        }
                    }else
                        val= LIST_NOT_USED&0xFF;
                    fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
            }
            for(list=0; list<h->list_count; list++){
                unsigned int val;
                if(IS_DIR(mb_type, 0, list)){
                    pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
                    mx += get_se_golomb(&s->gb);
                    my += get_se_golomb(&s->gb);
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    val= pack16to32(mx,my);
                }else
                    val=0;
                fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
            }
        }
        else if(IS_16X8(mb_type)){
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){
                            if(h->ref_count[list] == 1){
                                val= 0;
                            }else if(h->ref_count[list] == 2){
                                val= get_bits1(&s->gb)^1;
                            }else{
                                val= get_ue_golomb_31(&s->gb);
                                if(val >= h->ref_count[list]){
                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                    return -1;
                                }
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
                }
            }
        }else{
            assert(IS_8X16(mb_type));
            for(list=0; list<h->list_count; list++){
                    for(i=0; i<2; i++){
                        unsigned int val;
                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
                            if(h->ref_count[list]==1){
                                val= 0;
                            }else if(h->ref_count[list]==2){
                                val= get_bits1(&s->gb)^1;
                            }else{
                                val= get_ue_golomb_31(&s->gb);
                                if(val >= h->ref_count[list]){
                                    av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
                                    return -1;
                                }
                            }
                        }else
                            val= LIST_NOT_USED&0xFF;
                        fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
                    }
            }
            for(list=0; list<h->list_count; list++){
                for(i=0; i<2; i++){
                    unsigned int val;
                    if(IS_DIR(mb_type, i, list)){
                        pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
                        mx += get_se_golomb(&s->gb);
                        my += get_se_golomb(&s->gb);
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        val= pack16to32(mx,my);
                    }else
                        val=0;
                    fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
                }
            }
        }
    }

    if(IS_INTER(mb_type))
        write_back_motion(h, mb_type);

    // intra 16x16 already got its cbp from i_mb_type_info; every other type
    // codes an index that is mapped through the golomb_to_*_cbp tables
    if(!IS_INTRA16x16(mb_type)){
        cbp= get_ue_golomb(&s->gb);
        if(cbp > 47){
            av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
            return -1;
        }

        if(CHROMA){
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
            else                     cbp= golomb_to_inter_cbp   [cbp];
        }else{
            if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
            else                     cbp= golomb_to_inter_cbp_gray[cbp];
        }
    }
    h->cbp = cbp;

    // non intra macroblocks with coded luma read the 8x8 transform flag here
    if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
        if(get_bits1(&s->gb)){
            mb_type |= MB_TYPE_8x8DCT;
            h->cbp_table[mb_xy]= cbp;
        }
    }
    s->current_picture.mb_type[mb_xy]= mb_type;

    if(cbp || IS_INTRA16x16(mb_type)){
        int i8x8, i4x4, chroma_idx;
        int dquant;
        // residual data is read through the intra or inter bit reader
        // (presumably these differ only with data partitioning - TODO confirm)
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
        const uint8_t *scan, *scan8x8, *dc_scan;

//        fill_non_zero_count_cache(h);

        // the scan tables depend on field/frame coding and on qscale being zero
        if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
            dc_scan= luma_dc_field_scan;
        }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
            dc_scan= luma_dc_zigzag_scan;
        }

        dquant= get_se_golomb(&s->gb);

        if( dquant > 25 || dquant < -26 ){
            av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
            return -1;
        }

        // apply the delta and wrap the result by 52 if it leaves 0..51
        s->qscale += dquant;
        if(((unsigned)s->qscale) > 51){
            if(s->qscale<0) s->qscale+= 52;
            else            s->qscale-= 52;
        }

        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
        if(IS_INTRA16x16(mb_type)){
            // luma DC block first, then (if coded) 16 AC blocks of 15 coefficients
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                return -1; //FIXME continue if partitioned and other return -1 too
            }

            assert((cbp&15) == 0 || (cbp&15) == 15);

            if(cbp&15){
                for(i8x8=0; i8x8<4; i8x8++){
                    for(i4x4=0; i4x4<4; i4x4++){
                        const int index= i4x4 + 4*i8x8;
                        if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
                            return -1;
                        }
                    }
                }
            }else{
                fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
            }
        }else{
            // one cbp bit per 8x8 luma block
            for(i8x8=0; i8x8<4; i8x8++){
                if(cbp & (1<<i8x8)){
                    if(IS_8x8DCT(mb_type)){
                        DCTELEM *buf = &h->mb[64*i8x8];
                        uint8_t *nnz;
                        // the 8x8 block is coded as four passes into the same buffer,
                        // each with its own 16 entry part of the 8x8 scan table
                        for(i4x4=0; i4x4<4; i4x4++){
                            if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
                                                h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
                                return -1;
                        }
                        // accumulate the four counts into the first cache entry
                        nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                        nnz[0] += nnz[1] + nnz[8] + nnz[9];
                    }else{
                        for(i4x4=0; i4x4<4; i4x4++){
                            const int index= i4x4 + 4*i8x8;

                            if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
                                return -1;
                            }
                        }
                    }
                }else{
                    uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
                }
            }
        }

        // chroma DC is present if either of the two upper cbp bits is set
        if(cbp&0x30){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++)
                if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
                    return -1;
                }
        }

        // chroma AC is present only if bit 0x20 is set
        if(cbp&0x20){
            for(chroma_idx=0; chroma_idx<2; chroma_idx++){
                const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
                for(i4x4=0; i4x4<4; i4x4++){
                    const int index= 16 + 4*chroma_idx + i4x4;
                    if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
                        return -1;
                    }
                }
            }
        }else{
            uint8_t * const nnz= &h->non_zero_count_cache[0];
            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
        }
    }else{
        // nothing coded: clear all luma and chroma non-zero counts in the cache
        uint8_t * const nnz= &h->non_zero_count_cache[0];
        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
    }
    s->current_picture.qscale_table[mb_xy]= s->qscale;
    write_back_non_zero_count(h);

    // undo the doubling of the reference counts done before fill_caches
    if(MB_MBAFF){
        h->ref_count[0] >>= 1;
        h->ref_count[1] >>= 1;
    }

    return 0;
}
4849
4850 static int decode_cabac_field_decoding_flag(H264Context *h) {
4851     MpegEncContext * const s = &h->s;
4852     const int mb_x = s->mb_x;
4853     const int mb_y = s->mb_y & ~1;
4854     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4855     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4856
4857     unsigned int ctx = 0;
4858
4859     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4860         ctx += 1;
4861     }
4862     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4863         ctx += 1;
4864     }
4865
4866     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4867 }
4868
4869 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4870     uint8_t *state= &h->cabac_state[ctx_base];
4871     int mb_type;
4872
4873     if(intra_slice){
4874         MpegEncContext * const s = &h->s;
4875         const int mba_xy = h->left_mb_xy[0];
4876         const int mbb_xy = h->top_mb_xy;
4877         int ctx=0;
4878         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4879             ctx++;
4880         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4881             ctx++;
4882         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4883             return 0;   /* I4x4 */
4884         state += 2;
4885     }else{
4886         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4887             return 0;   /* I4x4 */
4888     }
4889
4890     if( get_cabac_terminate( &h->cabac ) )
4891         return 25;  /* PCM */
4892
4893     mb_type = 1; /* I16x16 */
4894     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4895     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4896         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4897     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4898     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4899     return mb_type;
4900 }
4901
4902 static int decode_cabac_mb_type_b( H264Context *h ) {
4903     MpegEncContext * const s = &h->s;
4904
4905         const int mba_xy = h->left_mb_xy[0];
4906         const int mbb_xy = h->top_mb_xy;
4907         int ctx = 0;
4908         int bits;
4909         assert(h->slice_type_nos == FF_B_TYPE);
4910
4911         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4912             ctx++;
4913         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4914             ctx++;
4915
4916         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4917             return 0; /* B_Direct_16x16 */
4918
4919         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4920             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4921         }
4922
4923         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4924         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4925         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4926         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4927         if( bits < 8 )
4928             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4929         else if( bits == 13 ) {
4930             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4931         } else if( bits == 14 )
4932             return 11; /* B_L1_L0_8x16 */
4933         else if( bits == 15 )
4934             return 22; /* B_8x8 */
4935
4936         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4937         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4938 }
4939
4940 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4941     MpegEncContext * const s = &h->s;
4942     int mba_xy, mbb_xy;
4943     int ctx = 0;
4944
4945     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4946         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4947         mba_xy = mb_xy - 1;
4948         if( (mb_y&1)
4949             && h->slice_table[mba_xy] == h->slice_num
4950             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4951             mba_xy += s->mb_stride;
4952         if( MB_FIELD ){
4953             mbb_xy = mb_xy - s->mb_stride;
4954             if( !(mb_y&1)
4955                 && h->slice_table[mbb_xy] == h->slice_num
4956                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4957                 mbb_xy -= s->mb_stride;
4958         }else
4959             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4960     }else{
4961         int mb_xy = h->mb_xy;
4962         mba_xy = mb_xy - 1;
4963         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4964     }
4965
4966     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4967         ctx++;
4968     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4969         ctx++;
4970
4971     if( h->slice_type_nos == FF_B_TYPE )
4972         ctx += 13;
4973     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4974 }
4975
4976 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4977     int mode = 0;
4978
4979     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4980         return pred_mode;
4981
4982     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4983     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4984     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4985
4986     if( mode >= pred_mode )
4987         return mode + 1;
4988     else
4989         return mode;
4990 }
4991
4992 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4993     const int mba_xy = h->left_mb_xy[0];
4994     const int mbb_xy = h->top_mb_xy;
4995
4996     int ctx = 0;
4997
4998     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4999     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5000         ctx++;
5001
5002     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5003         ctx++;
5004
5005     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5006         return 0;
5007
5008     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5009         return 1;
5010     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5011         return 2;
5012     else
5013         return 3;
5014 }
5015
5016 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5017     int cbp_b, cbp_a, ctx, cbp = 0;
5018
5019     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5020     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5021
5022     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5023     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5024     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5025     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5026     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5027     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5028     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5029     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5030     return cbp;
5031 }
5032 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5033     int ctx;
5034     int cbp_a, cbp_b;
5035
5036     cbp_a = (h->left_cbp>>4)&0x03;
5037     cbp_b = (h-> top_cbp>>4)&0x03;
5038
5039     ctx = 0;
5040     if( cbp_a > 0 ) ctx++;
5041     if( cbp_b > 0 ) ctx += 2;
5042     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5043         return 0;
5044
5045     ctx = 4;
5046     if( cbp_a == 2 ) ctx++;
5047     if( cbp_b == 2 ) ctx += 2;
5048     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5049 }
5050 static int decode_cabac_mb_dqp( H264Context *h) {
5051     int   ctx= h->last_qscale_diff != 0;
5052     int   val = 0;
5053
5054     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5055         ctx= 2+(ctx>>1);
5056         val++;
5057         if(val > 102) //prevent infinite loop
5058             return INT_MIN;
5059     }
5060
5061     if( val&0x01 )
5062         return   (val + 1)>>1 ;
5063     else
5064         return -((val + 1)>>1);
5065 }
5066 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5067     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5068         return 0;   /* 8x8 */
5069     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5070         return 1;   /* 8x4 */
5071     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5072         return 2;   /* 4x8 */
5073     return 3;       /* 4x4 */
5074 }
5075 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5076     int type;
5077     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5078         return 0;   /* B_Direct_8x8 */
5079     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5080         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5081     type = 3;
5082     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5083         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5084             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5085         type += 4;
5086     }
5087     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5088     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5089     return type;
5090 }
5091
5092 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5093     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5094 }
5095
5096 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5097     int refa = h->ref_cache[list][scan8[n] - 1];
5098     int refb = h->ref_cache[list][scan8[n] - 8];
5099     int ref  = 0;
5100     int ctx  = 0;
5101
5102     if( h->slice_type_nos == FF_B_TYPE) {
5103         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5104             ctx++;
5105         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5106             ctx += 2;
5107     } else {
5108         if( refa > 0 )
5109             ctx++;
5110         if( refb > 0 )
5111             ctx += 2;
5112     }
5113
5114     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5115         ref++;
5116         ctx = (ctx>>2)+4;
5117         if(ref >= 32 /*h->ref_list[list]*/){
5118             return -1;
5119         }
5120     }
5121     return ref;
5122 }
5123
5124 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5125     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5126                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5127     int ctxbase = (l == 0) ? 40 : 47;
5128     int mvd;
5129     int ctx = (amvd>2) + (amvd>32);
5130
5131     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5132         return 0;
5133
5134     mvd= 1;
5135     ctx= 3;
5136     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5137         mvd++;
5138         if( ctx < 6 )
5139             ctx++;
5140     }
5141
5142     if( mvd >= 9 ) {
5143         int k = 3;
5144         while( get_cabac_bypass( &h->cabac ) ) {
5145             mvd += 1 << k;
5146             k++;
5147             if(k>24){
5148                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5149                 return INT_MIN;
5150             }
5151         }
5152         while( k-- ) {
5153             if( get_cabac_bypass( &h->cabac ) )
5154                 mvd += 1 << k;
5155         }
5156     }
5157     return get_cabac_bypass_sign( &h->cabac, -mvd );
5158 }
5159
5160 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5161     int nza, nzb;
5162     int ctx = 0;
5163
5164     if( is_dc ) {
5165         if( cat == 0 ) {
5166             nza = h->left_cbp&0x100;
5167             nzb = h-> top_cbp&0x100;
5168         } else {
5169             nza = (h->left_cbp>>(6+idx))&0x01;
5170             nzb = (h-> top_cbp>>(6+idx))&0x01;
5171         }
5172     } else {
5173         assert(cat == 1 || cat == 2 || cat == 4);
5174         nza = h->non_zero_count_cache[scan8[idx] - 1];
5175         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5176     }
5177
5178     if( nza > 0 )
5179         ctx++;
5180
5181     if( nzb > 0 )
5182         ctx += 2;
5183
5184     return ctx + 4 * cat;
5185 }
5186
5187 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5188     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5189     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5190     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5191     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5192 };
5193
5194 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5195     static const int significant_coeff_flag_offset[2][6] = {
5196       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5197       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5198     };
5199     static const int last_coeff_flag_offset[2][6] = {
5200       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5201       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5202     };
5203     static const int coeff_abs_level_m1_offset[6] = {
5204         227+0, 227+10, 227+20, 227+30, 227+39, 426
5205     };
5206     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5207       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5208         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5209         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5210        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5211       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5212         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5213         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5214         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5215     };
5216     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5217      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5218      * map node ctx => cabac ctx for level=1 */
5219     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5220     /* map node ctx => cabac ctx for level>1 */
5221     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5222     static const uint8_t coeff_abs_level_transition[2][8] = {
5223     /* update node ctx after decoding a level=1 */
5224         { 1, 2, 3, 3, 4, 5, 6, 7 },
5225     /* update node ctx after decoding a level>1 */
5226         { 4, 4, 4, 4, 5, 6, 7, 7 }
5227     };
5228
5229     int index[64];
5230
5231     int av_unused last;
5232     int coeff_count = 0;
5233     int node_ctx = 0;
5234
5235     uint8_t *significant_coeff_ctx_base;
5236     uint8_t *last_coeff_ctx_base;
5237     uint8_t *abs_level_m1_ctx_base;
5238
5239 #if !ARCH_X86
5240 #define CABAC_ON_STACK
5241 #endif
5242 #ifdef CABAC_ON_STACK
5243 #define CC &cc
5244     CABACContext cc;
5245     cc.range     = h->cabac.range;
5246     cc.low       = h->cabac.low;
5247     cc.bytestream= h->cabac.bytestream;
5248 #else
5249 #define CC &h->cabac
5250 #endif
5251
5252
5253     /* cat: 0-> DC 16x16  n = 0
5254      *      1-> AC 16x16  n = luma4x4idx
5255      *      2-> Luma4x4   n = luma4x4idx
5256      *      3-> DC Chroma n = iCbCr
5257      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5258      *      5-> Luma8x8   n = 4 * luma8x8idx
5259      */
5260
5261     /* read coded block flag */
5262     if( is_dc || cat != 5 ) {
5263         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5264             if( !is_dc )
5265                 h->non_zero_count_cache[scan8[n]] = 0;
5266
5267 #ifdef CABAC_ON_STACK
5268             h->cabac.range     = cc.range     ;
5269             h->cabac.low       = cc.low       ;
5270             h->cabac.bytestream= cc.bytestream;
5271 #endif
5272             return;
5273         }
5274     }
5275
5276     significant_coeff_ctx_base = h->cabac_state
5277         + significant_coeff_flag_offset[MB_FIELD][cat];
5278     last_coeff_ctx_base = h->cabac_state
5279         + last_coeff_flag_offset[MB_FIELD][cat];
5280     abs_level_m1_ctx_base = h->cabac_state
5281         + coeff_abs_level_m1_offset[cat];
5282
5283     if( !is_dc && cat == 5 ) {
5284 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5285         for(last= 0; last < coefs; last++) { \
5286             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5287             if( get_cabac( CC, sig_ctx )) { \
5288                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5289                 index[coeff_count++] = last; \
5290                 if( get_cabac( CC, last_ctx ) ) { \
5291                     last= max_coeff; \
5292                     break; \
5293                 } \
5294             } \
5295         }\
5296         if( last == max_coeff -1 ) {\
5297             index[coeff_count++] = last;\
5298         }
5299         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5300 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5301         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5302     } else {
5303         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5304 #else
5305         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5306     } else {
5307         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5308 #endif
5309     }
5310     assert(coeff_count > 0);
5311
5312     if( is_dc ) {
5313         if( cat == 0 )
5314             h->cbp_table[h->mb_xy] |= 0x100;
5315         else
5316             h->cbp_table[h->mb_xy] |= 0x40 << n;
5317     } else {
5318         if( cat == 5 )
5319             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5320         else {
5321             assert( cat == 1 || cat == 2 || cat == 4 );
5322             h->non_zero_count_cache[scan8[n]] = coeff_count;
5323         }
5324     }
5325
5326     do {
5327         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5328
5329         int j= scantable[index[--coeff_count]];
5330
5331         if( get_cabac( CC, ctx ) == 0 ) {
5332             node_ctx = coeff_abs_level_transition[0][node_ctx];
5333             if( is_dc ) {
5334                 block[j] = get_cabac_bypass_sign( CC, -1);
5335             }else{
5336                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5337             }
5338         } else {
5339             int coeff_abs = 2;
5340             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5341             node_ctx = coeff_abs_level_transition[1][node_ctx];
5342
5343             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5344                 coeff_abs++;
5345             }
5346
5347             if( coeff_abs >= 15 ) {
5348                 int j = 0;
5349                 while( get_cabac_bypass( CC ) ) {
5350                     j++;
5351                 }
5352
5353                 coeff_abs=1;
5354                 while( j-- ) {
5355                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5356                 }
5357                 coeff_abs+= 14;
5358             }
5359
5360             if( is_dc ) {
5361                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5362             }else{
5363                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5364             }
5365         }
5366     } while( coeff_count );
5367 #ifdef CABAC_ON_STACK
5368             h->cabac.range     = cc.range     ;
5369             h->cabac.low       = cc.low       ;
5370             h->cabac.bytestream= cc.bytestream;
5371 #endif
5372
5373 }
5374
5375 #if !CONFIG_SMALL
5376 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5377     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5378 }
5379
5380 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5381     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5382 }
5383 #endif
5384
5385 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5386 #if CONFIG_SMALL
5387     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5388 #else
5389     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5390     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5391 #endif
5392 }
5393
5394 static inline void compute_mb_neighbors(H264Context *h)
5395 {
5396     MpegEncContext * const s = &h->s;
5397     const int mb_xy  = h->mb_xy;
5398     h->top_mb_xy     = mb_xy - s->mb_stride;
5399     h->left_mb_xy[0] = mb_xy - 1;
5400     if(FRAME_MBAFF){
5401         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5402         const int top_pair_xy      = pair_xy     - s->mb_stride;
5403         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5404         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5405         const int curr_mb_field_flag = MB_FIELD;
5406         const int bottom = (s->mb_y & 1);
5407
5408         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5409             h->top_mb_xy -= s->mb_stride;
5410         }
5411         if (!left_mb_field_flag == curr_mb_field_flag) {
5412             h->left_mb_xy[0] = pair_xy - 1;
5413         }
5414     } else if (FIELD_PICTURE) {
5415         h->top_mb_xy -= s->mb_stride;
5416     }
5417     return;
5418 }
5419
5420 /**
5421  * decodes a macroblock
5422  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5423  */
5424 static int decode_mb_cabac(H264Context *h) {
5425     MpegEncContext * const s = &h->s;
5426     int mb_xy;
5427     int mb_type, partition_count, cbp = 0;
5428     int dct8x8_allowed= h->pps.transform_8x8_mode;
5429
5430     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5431
5432     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5433     if( h->slice_type_nos != FF_I_TYPE ) {
5434         int skip;
5435         /* a skipped mb needs the aff flag from the following mb */
5436         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5437             predict_field_decoding_flag(h);
5438         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5439             skip = h->next_mb_skipped;
5440         else
5441             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5442         /* read skip flags */
5443         if( skip ) {
5444             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5445                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5446                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5447                 if(!h->next_mb_skipped)
5448                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5449             }
5450
5451             decode_mb_skip(h);
5452
5453             h->cbp_table[mb_xy] = 0;
5454             h->chroma_pred_mode_table[mb_xy] = 0;
5455             h->last_qscale_diff = 0;
5456
5457             return 0;
5458
5459         }
5460     }
5461     if(FRAME_MBAFF){
5462         if( (s->mb_y&1) == 0 )
5463             h->mb_mbaff =
5464             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5465     }
5466
5467     h->prev_mb_skipped = 0;
5468
5469     compute_mb_neighbors(h);
5470
5471     if( h->slice_type_nos == FF_B_TYPE ) {
5472         mb_type = decode_cabac_mb_type_b( h );
5473         if( mb_type < 23 ){
5474             partition_count= b_mb_type_info[mb_type].partition_count;
5475             mb_type=         b_mb_type_info[mb_type].type;
5476         }else{
5477             mb_type -= 23;
5478             goto decode_intra_mb;
5479         }
5480     } else if( h->slice_type_nos == FF_P_TYPE ) {
5481         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5482             /* P-type */
5483             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5484                 /* P_L0_D16x16, P_8x8 */
5485                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5486             } else {
5487                 /* P_L0_D8x16, P_L0_D16x8 */
5488                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5489             }
5490             partition_count= p_mb_type_info[mb_type].partition_count;
5491             mb_type=         p_mb_type_info[mb_type].type;
5492         } else {
5493             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5494             goto decode_intra_mb;
5495         }
5496     } else {
5497         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5498         if(h->slice_type == FF_SI_TYPE && mb_type)
5499             mb_type--;
5500         assert(h->slice_type_nos == FF_I_TYPE);
5501 decode_intra_mb:
5502         partition_count = 0;
5503         cbp= i_mb_type_info[mb_type].cbp;
5504         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5505         mb_type= i_mb_type_info[mb_type].type;
5506     }
5507     if(MB_FIELD)
5508         mb_type |= MB_TYPE_INTERLACED;
5509
5510     h->slice_table[ mb_xy ]= h->slice_num;
5511
5512     if(IS_INTRA_PCM(mb_type)) {
5513         const uint8_t *ptr;
5514
5515         // We assume these blocks are very rare so we do not optimize it.
5516         // FIXME The two following lines get the bitstream position in the cabac
5517         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5518         ptr= h->cabac.bytestream;
5519         if(h->cabac.low&0x1) ptr--;
5520         if(CABAC_BITS==16){
5521             if(h->cabac.low&0x1FF) ptr--;
5522         }
5523
5524         // The pixels are stored in the same order as levels in h->mb array.
5525         memcpy(h->mb, ptr, 256); ptr+=256;
5526         if(CHROMA){
5527             memcpy(h->mb+128, ptr, 128); ptr+=128;
5528         }
5529
5530         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5531
5532         // All blocks are present
5533         h->cbp_table[mb_xy] = 0x1ef;
5534         h->chroma_pred_mode_table[mb_xy] = 0;
5535         // In deblocking, the quantizer is 0
5536         s->current_picture.qscale_table[mb_xy]= 0;
5537         // All coeffs are present
5538         memset(h->non_zero_count[mb_xy], 16, 16);
5539         s->current_picture.mb_type[mb_xy]= mb_type;
5540         h->last_qscale_diff = 0;
5541         return 0;
5542     }
5543
5544     if(MB_MBAFF){
5545         h->ref_count[0] <<= 1;
5546         h->ref_count[1] <<= 1;
5547     }
5548
5549     fill_caches(h, mb_type, 0);
5550
5551     if( IS_INTRA( mb_type ) ) {
5552         int i, pred_mode;
5553         if( IS_INTRA4x4( mb_type ) ) {
5554             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5555                 mb_type |= MB_TYPE_8x8DCT;
5556                 for( i = 0; i < 16; i+=4 ) {
5557                     int pred = pred_intra_mode( h, i );
5558                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5559                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5560                 }
5561             } else {
5562                 for( i = 0; i < 16; i++ ) {
5563                     int pred = pred_intra_mode( h, i );
5564                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5565
5566                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5567                 }
5568             }
5569             write_back_intra_pred_mode(h);
5570             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5571         } else {
5572             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5573             if( h->intra16x16_pred_mode < 0 ) return -1;
5574         }
5575         if(CHROMA){
5576             h->chroma_pred_mode_table[mb_xy] =
5577             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5578
5579             pred_mode= check_intra_pred_mode( h, pred_mode );
5580             if( pred_mode < 0 ) return -1;
5581             h->chroma_pred_mode= pred_mode;
5582         }
5583     } else if( partition_count == 4 ) {
5584         int i, j, sub_partition_count[4], list, ref[2][4];
5585
5586         if( h->slice_type_nos == FF_B_TYPE ) {
5587             for( i = 0; i < 4; i++ ) {
5588                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5589                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5590                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5591             }
5592             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5593                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5594                 pred_direct_motion(h, &mb_type);
5595                 h->ref_cache[0][scan8[4]] =
5596                 h->ref_cache[1][scan8[4]] =
5597                 h->ref_cache[0][scan8[12]] =
5598                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5599                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5600                     for( i = 0; i < 4; i++ )
5601                         if( IS_DIRECT(h->sub_mb_type[i]) )
5602                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5603                 }
5604             }
5605         } else {
5606             for( i = 0; i < 4; i++ ) {
5607                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5608                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5609                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5610             }
5611         }
5612
5613         for( list = 0; list < h->list_count; list++ ) {
5614                 for( i = 0; i < 4; i++ ) {
5615                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5616                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5617                         if( h->ref_count[list] > 1 ){
5618                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5619                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5620                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5621                                 return -1;
5622                             }
5623                         }else
5624                             ref[list][i] = 0;
5625                     } else {
5626                         ref[list][i] = -1;
5627                     }
5628                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5629                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5630                 }
5631         }
5632
5633         if(dct8x8_allowed)
5634             dct8x8_allowed = get_dct8x8_allowed(h);
5635
5636         for(list=0; list<h->list_count; list++){
5637             for(i=0; i<4; i++){
5638                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5639                 if(IS_DIRECT(h->sub_mb_type[i])){
5640                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5641                     continue;
5642                 }
5643
5644                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5645                     const int sub_mb_type= h->sub_mb_type[i];
5646                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5647                     for(j=0; j<sub_partition_count[i]; j++){
5648                         int mpx, mpy;
5649                         int mx, my;
5650                         const int index= 4*i + block_width*j;
5651                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5652                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5653                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5654
5655                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5656                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5657                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5658
5659                         if(IS_SUB_8X8(sub_mb_type)){
5660                             mv_cache[ 1 ][0]=
5661                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5662                             mv_cache[ 1 ][1]=
5663                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5664
5665                             mvd_cache[ 1 ][0]=
5666                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5667                             mvd_cache[ 1 ][1]=
5668                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5669                         }else if(IS_SUB_8X4(sub_mb_type)){
5670                             mv_cache[ 1 ][0]= mx;
5671                             mv_cache[ 1 ][1]= my;
5672
5673                             mvd_cache[ 1 ][0]= mx - mpx;
5674                             mvd_cache[ 1 ][1]= my - mpy;
5675                         }else if(IS_SUB_4X8(sub_mb_type)){
5676                             mv_cache[ 8 ][0]= mx;
5677                             mv_cache[ 8 ][1]= my;
5678
5679                             mvd_cache[ 8 ][0]= mx - mpx;
5680                             mvd_cache[ 8 ][1]= my - mpy;
5681                         }
5682                         mv_cache[ 0 ][0]= mx;
5683                         mv_cache[ 0 ][1]= my;
5684
5685                         mvd_cache[ 0 ][0]= mx - mpx;
5686                         mvd_cache[ 0 ][1]= my - mpy;
5687                     }
5688                 }else{
5689                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5690                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5691                     p[0] = p[1] = p[8] = p[9] = 0;
5692                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5693                 }
5694             }
5695         }
5696     } else if( IS_DIRECT(mb_type) ) {
5697         pred_direct_motion(h, &mb_type);
5698         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5699         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5700         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5701     } else {
5702         int list, mx, my, i, mpx, mpy;
5703         if(IS_16X16(mb_type)){
5704             for(list=0; list<h->list_count; list++){
5705                 if(IS_DIR(mb_type, 0, list)){
5706                     int ref;
5707                     if(h->ref_count[list] > 1){
5708                         ref= decode_cabac_mb_ref(h, list, 0);
5709                         if(ref >= (unsigned)h->ref_count[list]){
5710                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5711                             return -1;
5712                         }
5713                     }else
5714                         ref=0;
5715                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5716                 }else
5717                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5718             }
5719             for(list=0; list<h->list_count; list++){
5720                 if(IS_DIR(mb_type, 0, list)){
5721                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5722
5723                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5724                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5725                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5726
5727                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5728                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5729                 }else
5730                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5731             }
5732         }
5733         else if(IS_16X8(mb_type)){
5734             for(list=0; list<h->list_count; list++){
5735                     for(i=0; i<2; i++){
5736                         if(IS_DIR(mb_type, i, list)){
5737                             int ref;
5738                             if(h->ref_count[list] > 1){
5739                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5740                                 if(ref >= (unsigned)h->ref_count[list]){
5741                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5742                                     return -1;
5743                                 }
5744                             }else
5745                                 ref=0;
5746                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5747                         }else
5748                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5749                     }
5750             }
5751             for(list=0; list<h->list_count; list++){
5752                 for(i=0; i<2; i++){
5753                     if(IS_DIR(mb_type, i, list)){
5754                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5755                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5756                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5757                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5758
5759                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5760                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5761                     }else{
5762                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5763                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5764                     }
5765                 }
5766             }
5767         }else{
5768             assert(IS_8X16(mb_type));
5769             for(list=0; list<h->list_count; list++){
5770                     for(i=0; i<2; i++){
5771                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5772                             int ref;
5773                             if(h->ref_count[list] > 1){
5774                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5775                                 if(ref >= (unsigned)h->ref_count[list]){
5776                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5777                                     return -1;
5778                                 }
5779                             }else
5780                                 ref=0;
5781                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5782                         }else
5783                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5784                     }
5785             }
5786             for(list=0; list<h->list_count; list++){
5787                 for(i=0; i<2; i++){
5788                     if(IS_DIR(mb_type, i, list)){
5789                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5790                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5791                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5792
5793                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5794                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5795                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5796                     }else{
5797                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5798                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5799                     }
5800                 }
5801             }
5802         }
5803     }
5804
5805    if( IS_INTER( mb_type ) ) {
5806         h->chroma_pred_mode_table[mb_xy] = 0;
5807         write_back_motion( h, mb_type );
5808    }
5809
5810     if( !IS_INTRA16x16( mb_type ) ) {
5811         cbp  = decode_cabac_mb_cbp_luma( h );
5812         if(CHROMA)
5813             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5814     }
5815
5816     h->cbp_table[mb_xy] = h->cbp = cbp;
5817
5818     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5819         if( decode_cabac_mb_transform_size( h ) )
5820             mb_type |= MB_TYPE_8x8DCT;
5821     }
5822     s->current_picture.mb_type[mb_xy]= mb_type;
5823
5824     if( cbp || IS_INTRA16x16( mb_type ) ) {
5825         const uint8_t *scan, *scan8x8, *dc_scan;
5826         const uint32_t *qmul;
5827         int dqp;
5828
5829         if(IS_INTERLACED(mb_type)){
5830             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5831             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5832             dc_scan= luma_dc_field_scan;
5833         }else{
5834             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5835             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5836             dc_scan= luma_dc_zigzag_scan;
5837         }
5838
5839         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5840         if( dqp == INT_MIN ){
5841             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5842             return -1;
5843         }
5844         s->qscale += dqp;
5845         if(((unsigned)s->qscale) > 51){
5846             if(s->qscale<0) s->qscale+= 52;
5847             else            s->qscale-= 52;
5848         }
5849         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5850         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5851
5852         if( IS_INTRA16x16( mb_type ) ) {
5853             int i;
5854             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5855             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5856
5857             if( cbp&15 ) {
5858                 qmul = h->dequant4_coeff[0][s->qscale];
5859                 for( i = 0; i < 16; i++ ) {
5860                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5861                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5862                 }
5863             } else {
5864                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5865             }
5866         } else {
5867             int i8x8, i4x4;
5868             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5869                 if( cbp & (1<<i8x8) ) {
5870                     if( IS_8x8DCT(mb_type) ) {
5871                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5872                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5873                     } else {
5874                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5875                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5876                             const int index = 4*i8x8 + i4x4;
5877                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5878 //START_TIMER
5879                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5880 //STOP_TIMER("decode_residual")
5881                         }
5882                     }
5883                 } else {
5884                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5885                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5886                 }
5887             }
5888         }
5889
5890         if( cbp&0x30 ){
5891             int c;
5892             for( c = 0; c < 2; c++ ) {
5893                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5894                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5895             }
5896         }
5897
5898         if( cbp&0x20 ) {
5899             int c, i;
5900             for( c = 0; c < 2; c++ ) {
5901                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5902                 for( i = 0; i < 4; i++ ) {
5903                     const int index = 16 + 4 * c + i;
5904                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5905                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5906                 }
5907             }
5908         } else {
5909             uint8_t * const nnz= &h->non_zero_count_cache[0];
5910             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5911             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5912         }
5913     } else {
5914         uint8_t * const nnz= &h->non_zero_count_cache[0];
5915         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5916         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5917         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5918         h->last_qscale_diff = 0;
5919     }
5920
5921     s->current_picture.qscale_table[mb_xy]= s->qscale;
5922     write_back_non_zero_count(h);
5923
5924     if(MB_MBAFF){
5925         h->ref_count[0] >>= 1;
5926         h->ref_count[1] >>= 1;
5927     }
5928
5929     return 0;
5930 }
5931
5932
5933 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5934     const int index_a = qp + h->slice_alpha_c0_offset;
5935     const int alpha = (alpha_table+52)[index_a];
5936     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5937
5938     if( bS[0] < 4 ) {
5939         int8_t tc[4];
5940         tc[0] = (tc0_table+52)[index_a][bS[0]];
5941         tc[1] = (tc0_table+52)[index_a][bS[1]];
5942         tc[2] = (tc0_table+52)[index_a][bS[2]];
5943         tc[3] = (tc0_table+52)[index_a][bS[3]];
5944         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5945     } else {
5946         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5947     }
5948 }
5949 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5950     const int index_a = qp + h->slice_alpha_c0_offset;
5951     const int alpha = (alpha_table+52)[index_a];
5952     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5953
5954     if( bS[0] < 4 ) {
5955         int8_t tc[4];
5956         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5957         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5958         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5959         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5960         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5961     } else {
5962         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5963     }
5964 }
5965
5966 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5967     int i;
5968     for( i = 0; i < 16; i++, pix += stride) {
5969         int index_a;
5970         int alpha;
5971         int beta;
5972
5973         int qp_index;
5974         int bS_index = (i >> 1);
5975         if (!MB_FIELD) {
5976             bS_index &= ~1;
5977             bS_index |= (i & 1);
5978         }
5979
5980         if( bS[bS_index] == 0 ) {
5981             continue;
5982         }
5983
5984         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5985         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5986         alpha = (alpha_table+52)[index_a];
5987         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5988
5989         if( bS[bS_index] < 4 ) {
5990             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5991             const int p0 = pix[-1];
5992             const int p1 = pix[-2];
5993             const int p2 = pix[-3];
5994             const int q0 = pix[0];
5995             const int q1 = pix[1];
5996             const int q2 = pix[2];
5997
5998             if( FFABS( p0 - q0 ) < alpha &&
5999                 FFABS( p1 - p0 ) < beta &&
6000                 FFABS( q1 - q0 ) < beta ) {
6001                 int tc = tc0;
6002                 int i_delta;
6003
6004                 if( FFABS( p2 - p0 ) < beta ) {
6005                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6006                     tc++;
6007                 }
6008                 if( FFABS( q2 - q0 ) < beta ) {
6009                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6010                     tc++;
6011                 }
6012
6013                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6014                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6015                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6016                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6017             }
6018         }else{
6019             const int p0 = pix[-1];
6020             const int p1 = pix[-2];
6021             const int p2 = pix[-3];
6022
6023             const int q0 = pix[0];
6024             const int q1 = pix[1];
6025             const int q2 = pix[2];
6026
6027             if( FFABS( p0 - q0 ) < alpha &&
6028                 FFABS( p1 - p0 ) < beta &&
6029                 FFABS( q1 - q0 ) < beta ) {
6030
6031                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6032                     if( FFABS( p2 - p0 ) < beta)
6033                     {
6034                         const int p3 = pix[-4];
6035                         /* p0', p1', p2' */
6036                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6037                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6038                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6039                     } else {
6040                         /* p0' */
6041                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6042                     }
6043                     if( FFABS( q2 - q0 ) < beta)
6044                     {
6045                         const int q3 = pix[3];
6046                         /* q0', q1', q2' */
6047                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6048                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6049                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6050                     } else {
6051                         /* q0' */
6052                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6053                     }
6054                 }else{
6055                     /* p0', q0' */
6056                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6057                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6058                 }
6059                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6060             }
6061         }
6062     }
6063 }
6064 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6065     int i;
6066     for( i = 0; i < 8; i++, pix += stride) {
6067         int index_a;
6068         int alpha;
6069         int beta;
6070
6071         int qp_index;
6072         int bS_index = i;
6073
6074         if( bS[bS_index] == 0 ) {
6075             continue;
6076         }
6077
6078         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6079         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6080         alpha = (alpha_table+52)[index_a];
6081         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6082
6083         if( bS[bS_index] < 4 ) {
6084             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6085             const int p0 = pix[-1];
6086             const int p1 = pix[-2];
6087             const int q0 = pix[0];
6088             const int q1 = pix[1];
6089
6090             if( FFABS( p0 - q0 ) < alpha &&
6091                 FFABS( p1 - p0 ) < beta &&
6092                 FFABS( q1 - q0 ) < beta ) {
6093                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6094
6095                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6096                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6097                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6098             }
6099         }else{
6100             const int p0 = pix[-1];
6101             const int p1 = pix[-2];
6102             const int q0 = pix[0];
6103             const int q1 = pix[1];
6104
6105             if( FFABS( p0 - q0 ) < alpha &&
6106                 FFABS( p1 - p0 ) < beta &&
6107                 FFABS( q1 - q0 ) < beta ) {
6108
6109                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6110                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6111                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6112             }
6113         }
6114     }
6115 }
6116
6117 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6118     const int index_a = qp + h->slice_alpha_c0_offset;
6119     const int alpha = (alpha_table+52)[index_a];
6120     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6121
6122     if( bS[0] < 4 ) {
6123         int8_t tc[4];
6124         tc[0] = (tc0_table+52)[index_a][bS[0]];
6125         tc[1] = (tc0_table+52)[index_a][bS[1]];
6126         tc[2] = (tc0_table+52)[index_a][bS[2]];
6127         tc[3] = (tc0_table+52)[index_a][bS[3]];
6128         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6129     } else {
6130         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6131     }
6132 }
6133
6134 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6135     const int index_a = qp + h->slice_alpha_c0_offset;
6136     const int alpha = (alpha_table+52)[index_a];
6137     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6138
6139     if( bS[0] < 4 ) {
6140         int8_t tc[4];
6141         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6142         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6143         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6144         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6145         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6146     } else {
6147         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6148     }
6149 }
6150
6151 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6152     MpegEncContext * const s = &h->s;
6153     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6154     int mb_xy, mb_type;
6155     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6156
6157     mb_xy = h->mb_xy;
6158
6159     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6160         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6161        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6162                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6163         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6164         return;
6165     }
6166     assert(!FRAME_MBAFF);
6167
6168     mb_type = s->current_picture.mb_type[mb_xy];
6169     qp = s->current_picture.qscale_table[mb_xy];
6170     qp0 = s->current_picture.qscale_table[mb_xy-1];
6171     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6172     qpc = get_chroma_qp( h, 0, qp );
6173     qpc0 = get_chroma_qp( h, 0, qp0 );
6174     qpc1 = get_chroma_qp( h, 0, qp1 );
6175     qp0 = (qp + qp0 + 1) >> 1;
6176     qp1 = (qp + qp1 + 1) >> 1;
6177     qpc0 = (qpc + qpc0 + 1) >> 1;
6178     qpc1 = (qpc + qpc1 + 1) >> 1;
6179     qp_thresh = 15 - h->slice_alpha_c0_offset;
6180     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6181        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6182         return;
6183
6184     if( IS_INTRA(mb_type) ) {
6185         int16_t bS4[4] = {4,4,4,4};
6186         int16_t bS3[4] = {3,3,3,3};
6187         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6188         if( IS_8x8DCT(mb_type) ) {
6189             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6190             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6191             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6192             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6193         } else {
6194             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6195             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6196             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6197             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6198             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6199             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6200             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6201             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6202         }
6203         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6204         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6205         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6206         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6207         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6208         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6209         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6210         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6211         return;
6212     } else {
6213         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6214         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6215         int edges;
6216         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6217             edges = 4;
6218             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6219         } else {
6220             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6221                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6222             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6223                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6224                              ? 3 : 0;
6225             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6226             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6227             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6228                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6229         }
6230         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6231             bSv[0][0] = 0x0004000400040004ULL;
6232         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6233             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6234
6235 #define FILTER(hv,dir,edge)\
6236         if(bSv[dir][edge]) {\
6237             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6238             if(!(edge&1)) {\
6239                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6240                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6241             }\
6242         }
6243         if( edges == 1 ) {
6244             FILTER(v,0,0);
6245             FILTER(h,1,0);
6246         } else if( IS_8x8DCT(mb_type) ) {
6247             FILTER(v,0,0);
6248             FILTER(v,0,2);
6249             FILTER(h,1,0);
6250             FILTER(h,1,2);
6251         } else {
6252             FILTER(v,0,0);
6253             FILTER(v,0,1);
6254             FILTER(v,0,2);
6255             FILTER(v,0,3);
6256             FILTER(h,1,0);
6257             FILTER(h,1,1);
6258             FILTER(h,1,2);
6259             FILTER(h,1,3);
6260         }
6261 #undef FILTER
6262     }
6263 }
6264
6265
6266 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6267     MpegEncContext * const s = &h->s;
6268     int edge;
6269     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6270     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6271     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6272     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6273     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6274
6275     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6276                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6277     // how often to recheck mv-based bS when iterating between edges
6278     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6279                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6280     // how often to recheck mv-based bS when iterating along each edge
6281     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6282
6283     if (first_vertical_edge_done) {
6284         start = 1;
6285     }
6286
6287     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6288         start = 1;
6289
6290     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6291         && !IS_INTERLACED(mb_type)
6292         && IS_INTERLACED(mbm_type)
6293         ) {
6294         // This is a special case in the norm where the filtering must
6295         // be done twice (one each of the field) even if we are in a
6296         // frame macroblock.
6297         //
6298         static const int nnz_idx[4] = {4,5,6,3};
6299         unsigned int tmp_linesize   = 2 *   linesize;
6300         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6301         int mbn_xy = mb_xy - 2 * s->mb_stride;
6302         int qp;
6303         int i, j;
6304         int16_t bS[4];
6305
6306         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6307             if( IS_INTRA(mb_type) ||
6308                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6309                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6310             } else {
6311                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6312                 for( i = 0; i < 4; i++ ) {
6313                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6314                         mbn_nnz[nnz_idx[i]] != 0 )
6315                         bS[i] = 2;
6316                     else
6317                         bS[i] = 1;
6318                 }
6319             }
6320             // Do not use s->qscale as luma quantizer because it has not the same
6321             // value in IPCM macroblocks.
6322             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6323             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6324             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6325             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6326             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6327                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6328             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6329                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6330         }
6331
6332         start = 1;
6333     }
6334
6335     /* Calculate bS */
6336     for( edge = start; edge < edges; edge++ ) {
6337         /* mbn_xy: neighbor macroblock */
6338         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6339         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6340         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6341         int16_t bS[4];
6342         int qp;
6343
6344         if( (edge&1) && IS_8x8DCT(mb_type) )
6345             continue;
6346
6347         if( IS_INTRA(mb_type) ||
6348             IS_INTRA(mbn_type) ) {
6349             int value;
6350             if (edge == 0) {
6351                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6352                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6353                 ) {
6354                     value = 4;
6355                 } else {
6356                     value = 3;
6357                 }
6358             } else {
6359                 value = 3;
6360             }
6361             bS[0] = bS[1] = bS[2] = bS[3] = value;
6362         } else {
6363             int i, l;
6364             int mv_done;
6365
6366             if( edge & mask_edge ) {
6367                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6368                 mv_done = 1;
6369             }
6370             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6371                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6372                 mv_done = 1;
6373             }
6374             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6375                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6376                 int bn_idx= b_idx - (dir ? 8:1);
6377                 int v = 0;
6378
6379                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6380                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6381                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6382                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6383                 }
6384
6385                 if(h->slice_type_nos == FF_B_TYPE && v){
6386                     v=0;
6387                     for( l = 0; !v && l < 2; l++ ) {
6388                         int ln= 1-l;
6389                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6390                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6391                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6392                     }
6393                 }
6394
6395                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6396                 mv_done = 1;
6397             }
6398             else
6399                 mv_done = 0;
6400
6401             for( i = 0; i < 4; i++ ) {
6402                 int x = dir == 0 ? edge : i;
6403                 int y = dir == 0 ? i    : edge;
6404                 int b_idx= 8 + 4 + x + 8*y;
6405                 int bn_idx= b_idx - (dir ? 8:1);
6406
6407                 if( h->non_zero_count_cache[b_idx] |
6408                     h->non_zero_count_cache[bn_idx] ) {
6409                     bS[i] = 2;
6410                 }
6411                 else if(!mv_done)
6412                 {
6413                     bS[i] = 0;
6414                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6415                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6416                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6417                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6418                             bS[i] = 1;
6419                             break;
6420                         }
6421                     }
6422
6423                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6424                         bS[i] = 0;
6425                         for( l = 0; l < 2; l++ ) {
6426                             int ln= 1-l;
6427                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6428                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6429                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6430                                 bS[i] = 1;
6431                                 break;
6432                             }
6433                         }
6434                     }
6435                 }
6436             }
6437
6438             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6439                 continue;
6440         }
6441
6442         /* Filter edge */
6443         // Do not use s->qscale as luma quantizer because it has not the same
6444         // value in IPCM macroblocks.
6445         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6446         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6447         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6448         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6449         if( dir == 0 ) {
6450             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6451             if( (edge&1) == 0 ) {
6452                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6453                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6454                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6455                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6456             }
6457         } else {
6458             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6459             if( (edge&1) == 0 ) {
6460                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6461                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6462                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6463                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6464             }
6465         }
6466     }
6467 }
6468
6469 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6470     MpegEncContext * const s = &h->s;
6471     const int mb_xy= mb_x + mb_y*s->mb_stride;
6472     const int mb_type = s->current_picture.mb_type[mb_xy];
6473     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6474     int first_vertical_edge_done = 0;
6475     av_unused int dir;
6476
6477     //for sufficiently low qp, filtering wouldn't do anything
6478     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6479     if(!FRAME_MBAFF){
6480         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6481         int qp = s->current_picture.qscale_table[mb_xy];
6482         if(qp <= qp_thresh
6483            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6484            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6485             return;
6486         }
6487     }
6488
6489     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6490     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6491         int top_type, left_type[2];
6492         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6493         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6494         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6495
6496         if(IS_8x8DCT(top_type)){
6497             h->non_zero_count_cache[4+8*0]=
6498             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6499             h->non_zero_count_cache[6+8*0]=
6500             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6501         }
6502         if(IS_8x8DCT(left_type[0])){
6503             h->non_zero_count_cache[3+8*1]=
6504             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6505         }
6506         if(IS_8x8DCT(left_type[1])){
6507             h->non_zero_count_cache[3+8*3]=
6508             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6509         }
6510
6511         if(IS_8x8DCT(mb_type)){
6512             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6513             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6514
6515             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6516             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6517
6518             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6519             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6520
6521             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6522             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6523         }
6524     }
6525
6526     if (FRAME_MBAFF
6527             // left mb is in picture
6528             && h->slice_table[mb_xy-1] != 0xFFFF
6529             // and current and left pair do not have the same interlaced type
6530             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6531             // and left mb is in the same slice if deblocking_filter == 2
6532             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6533         /* First vertical edge is different in MBAFF frames
6534          * There are 8 different bS to compute and 2 different Qp
6535          */
6536         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6537         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6538         int16_t bS[8];
6539         int qp[2];
6540         int bqp[2];
6541         int rqp[2];
6542         int mb_qp, mbn0_qp, mbn1_qp;
6543         int i;
6544         first_vertical_edge_done = 1;
6545
6546         if( IS_INTRA(mb_type) )
6547             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6548         else {
6549             for( i = 0; i < 8; i++ ) {
6550                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6551
6552                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6553                     bS[i] = 4;
6554                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6555                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6556                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6557                                                                        :
6558                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6559                     bS[i] = 2;
6560                 else
6561                     bS[i] = 1;
6562             }
6563         }
6564
6565         mb_qp = s->current_picture.qscale_table[mb_xy];
6566         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6567         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6568         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6569         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6570                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6571         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6572                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6573         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6574         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6575                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6576         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6577                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6578
6579         /* Filter edge */
6580         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6581         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6582         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6583         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6584         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6585     }
6586
6587 #if CONFIG_SMALL
6588     for( dir = 0; dir < 2; dir++ )
6589         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6590 #else
6591     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6592     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6593 #endif
6594 }
6595
6596 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6597     H264Context *h = *(void**)arg;
6598     MpegEncContext * const s = &h->s;
6599     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6600
6601     s->mb_skip_run= -1;
6602
6603     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6604                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6605
6606     if( h->pps.cabac ) {
6607         int i;
6608
6609         /* realign */
6610         align_get_bits( &s->gb );
6611
6612         /* init cabac */
6613         ff_init_cabac_states( &h->cabac);
6614         ff_init_cabac_decoder( &h->cabac,
6615                                s->gb.buffer + get_bits_count(&s->gb)/8,
6616                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6617         /* calculate pre-state */
6618         for( i= 0; i < 460; i++ ) {
6619             int pre;
6620             if( h->slice_type_nos == FF_I_TYPE )
6621                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6622             else
6623                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6624
6625             if( pre <= 63 )
6626                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6627             else
6628                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6629         }
6630
6631         for(;;){
6632 //START_TIMER
6633             int ret = decode_mb_cabac(h);
6634             int eos;
6635 //STOP_TIMER("decode_mb_cabac")
6636
6637             if(ret>=0) hl_decode_mb(h);
6638
6639             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6640                 s->mb_y++;
6641
6642                 ret = decode_mb_cabac(h);
6643
6644                 if(ret>=0) hl_decode_mb(h);
6645                 s->mb_y--;
6646             }
6647             eos = get_cabac_terminate( &h->cabac );
6648
6649             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6650                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6651                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6652                 return -1;
6653             }
6654
6655             if( ++s->mb_x >= s->mb_width ) {
6656                 s->mb_x = 0;
6657                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6658                 ++s->mb_y;
6659                 if(FIELD_OR_MBAFF_PICTURE) {
6660                     ++s->mb_y;
6661                 }
6662             }
6663
6664             if( eos || s->mb_y >= s->mb_height ) {
6665                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6666                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6667                 return 0;
6668             }
6669         }
6670
6671     } else {
6672         for(;;){
6673             int ret = decode_mb_cavlc(h);
6674
6675             if(ret>=0) hl_decode_mb(h);
6676
6677             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6678                 s->mb_y++;
6679                 ret = decode_mb_cavlc(h);
6680
6681                 if(ret>=0) hl_decode_mb(h);
6682                 s->mb_y--;
6683             }
6684
6685             if(ret<0){
6686                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6687                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6688
6689                 return -1;
6690             }
6691
6692             if(++s->mb_x >= s->mb_width){
6693                 s->mb_x=0;
6694                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6695                 ++s->mb_y;
6696                 if(FIELD_OR_MBAFF_PICTURE) {
6697                     ++s->mb_y;
6698                 }
6699                 if(s->mb_y >= s->mb_height){
6700                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6701
6702                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6703                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6704
6705                         return 0;
6706                     }else{
6707                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6708
6709                         return -1;
6710                     }
6711                 }
6712             }
6713
6714             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6715                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6716                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6717                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6718
6719                     return 0;
6720                 }else{
6721                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6722
6723                     return -1;
6724                 }
6725             }
6726         }
6727     }
6728
6729 #if 0
6730     for(;s->mb_y < s->mb_height; s->mb_y++){
6731         for(;s->mb_x < s->mb_width; s->mb_x++){
6732             int ret= decode_mb(h);
6733
6734             hl_decode_mb(h);
6735
6736             if(ret<0){
6737                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6738                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6739
6740                 return -1;
6741             }
6742
6743             if(++s->mb_x >= s->mb_width){
6744                 s->mb_x=0;
6745                 if(++s->mb_y >= s->mb_height){
6746                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6747                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6748
6749                         return 0;
6750                     }else{
6751                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6752
6753                         return -1;
6754                     }
6755                 }
6756             }
6757
6758             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6759                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6760                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6761
6762                     return 0;
6763                 }else{
6764                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6765
6766                     return -1;
6767                 }
6768             }
6769         }
6770         s->mb_x=0;
6771         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6772     }
6773 #endif
6774     return -1; //not reached
6775 }
6776
6777 static int decode_picture_timing(H264Context *h){
6778     MpegEncContext * const s = &h->s;
6779     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6780         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6781         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6782     }
6783     if(h->sps.pic_struct_present_flag){
6784         unsigned int i, num_clock_ts;
6785         h->sei_pic_struct = get_bits(&s->gb, 4);
6786
6787         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6788             return -1;
6789
6790         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6791
6792         for (i = 0 ; i < num_clock_ts ; i++){
6793             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6794                 unsigned int full_timestamp_flag;
6795                 skip_bits(&s->gb, 2);                 /* ct_type */
6796                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6797                 skip_bits(&s->gb, 5);                 /* counting_type */
6798                 full_timestamp_flag = get_bits(&s->gb, 1);
6799                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6800                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6801                 skip_bits(&s->gb, 8);                 /* n_frames */
6802                 if(full_timestamp_flag){
6803                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6804                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6805                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6806                 }else{
6807                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6808                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6809                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6810                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6811                             if(get_bits(&s->gb, 1))   /* hours_flag */
6812                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6813                         }
6814                     }
6815                 }
6816                 if(h->sps.time_offset_length > 0)
6817                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6818             }
6819         }
6820     }
6821     return 0;
6822 }
6823
6824 static int decode_unregistered_user_data(H264Context *h, int size){
6825     MpegEncContext * const s = &h->s;
6826     uint8_t user_data[16+256];
6827     int e, build, i;
6828
6829     if(size<16)
6830         return -1;
6831
6832     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6833         user_data[i]= get_bits(&s->gb, 8);
6834     }
6835
6836     user_data[i]= 0;
6837     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6838     if(e==1 && build>=0)
6839         h->x264_build= build;
6840
6841     if(s->avctx->debug & FF_DEBUG_BUGS)
6842         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6843
6844     for(; i<size; i++)
6845         skip_bits(&s->gb, 8);
6846
6847     return 0;
6848 }
6849
6850 static int decode_recovery_point(H264Context *h){
6851     MpegEncContext * const s = &h->s;
6852
6853     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6854     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6855
6856     return 0;
6857 }
6858
6859 static int decode_buffering_period(H264Context *h){
6860     MpegEncContext * const s = &h->s;
6861     unsigned int sps_id;
6862     int sched_sel_idx;
6863     SPS *sps;
6864
6865     sps_id = get_ue_golomb_31(&s->gb);
6866     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6867         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6868         return -1;
6869     }
6870     sps = h->sps_buffers[sps_id];
6871
6872     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6873     if (sps->nal_hrd_parameters_present_flag) {
6874         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6875             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6876             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6877         }
6878     }
6879     if (sps->vcl_hrd_parameters_present_flag) {
6880         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6881             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6882             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6883         }
6884     }
6885
6886     h->sei_buffering_period_present = 1;
6887     return 0;
6888 }
6889
6890 int ff_h264_decode_sei(H264Context *h){
6891     MpegEncContext * const s = &h->s;
6892
6893     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6894         int size, type;
6895
6896         type=0;
6897         do{
6898             type+= show_bits(&s->gb, 8);
6899         }while(get_bits(&s->gb, 8) == 255);
6900
6901         size=0;
6902         do{
6903             size+= show_bits(&s->gb, 8);
6904         }while(get_bits(&s->gb, 8) == 255);
6905
6906         switch(type){
6907         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6908             if(decode_picture_timing(h) < 0)
6909                 return -1;
6910             break;
6911         case SEI_TYPE_USER_DATA_UNREGISTERED:
6912             if(decode_unregistered_user_data(h, size) < 0)
6913                 return -1;
6914             break;
6915         case SEI_TYPE_RECOVERY_POINT:
6916             if(decode_recovery_point(h) < 0)
6917                 return -1;
6918             break;
6919         case SEI_BUFFERING_PERIOD:
6920             if(decode_buffering_period(h) < 0)
6921                 return -1;
6922             break;
6923         default:
6924             skip_bits(&s->gb, 8*size);
6925         }
6926
6927         //FIXME check bits here
6928         align_get_bits(&s->gb);
6929     }
6930
6931     return 0;
6932 }
6933
6934 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6935     MpegEncContext * const s = &h->s;
6936     int cpb_count, i;
6937     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6938
6939     if(cpb_count > 32U){
6940         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6941         return -1;
6942     }
6943
6944     get_bits(&s->gb, 4); /* bit_rate_scale */
6945     get_bits(&s->gb, 4); /* cpb_size_scale */
6946     for(i=0; i<cpb_count; i++){
6947         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6948         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6949         get_bits1(&s->gb);     /* cbr_flag */
6950     }
6951     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6952     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6953     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6954     sps->time_offset_length = get_bits(&s->gb, 5);
6955     sps->cpb_cnt = cpb_count;
6956     return 0;
6957 }
6958
6959 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6960     MpegEncContext * const s = &h->s;
6961     int aspect_ratio_info_present_flag;
6962     unsigned int aspect_ratio_idc;
6963
6964     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6965
6966     if( aspect_ratio_info_present_flag ) {
6967         aspect_ratio_idc= get_bits(&s->gb, 8);
6968         if( aspect_ratio_idc == EXTENDED_SAR ) {
6969             sps->sar.num= get_bits(&s->gb, 16);
6970             sps->sar.den= get_bits(&s->gb, 16);
6971         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6972             sps->sar=  pixel_aspect[aspect_ratio_idc];
6973         }else{
6974             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6975             return -1;
6976         }
6977     }else{
6978         sps->sar.num=
6979         sps->sar.den= 0;
6980     }
6981 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6982
6983     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6984         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6985     }
6986
6987     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6988         get_bits(&s->gb, 3);    /* video_format */
6989         get_bits1(&s->gb);      /* video_full_range_flag */
6990         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6991             get_bits(&s->gb, 8); /* colour_primaries */
6992             get_bits(&s->gb, 8); /* transfer_characteristics */
6993             get_bits(&s->gb, 8); /* matrix_coefficients */
6994         }
6995     }
6996
6997     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6998         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6999         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7000     }
7001
7002     sps->timing_info_present_flag = get_bits1(&s->gb);
7003     if(sps->timing_info_present_flag){
7004         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7005         sps->time_scale = get_bits_long(&s->gb, 32);
7006         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7007     }
7008
7009     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7010     if(sps->nal_hrd_parameters_present_flag)
7011         if(decode_hrd_parameters(h, sps) < 0)
7012             return -1;
7013     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7014     if(sps->vcl_hrd_parameters_present_flag)
7015         if(decode_hrd_parameters(h, sps) < 0)
7016             return -1;
7017     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7018         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7019     sps->pic_struct_present_flag = get_bits1(&s->gb);
7020
7021     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7022     if(sps->bitstream_restriction_flag){
7023         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7024         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7025         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7026         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7027         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7028         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7029         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7030
7031         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7032             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7033             return -1;
7034         }
7035     }
7036
7037     return 0;
7038 }
7039
7040 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7041                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7042     MpegEncContext * const s = &h->s;
7043     int i, last = 8, next = 8;
7044     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7045     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7046         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7047     else
7048     for(i=0;i<size;i++){
7049         if(next)
7050             next = (last + get_se_golomb(&s->gb)) & 0xff;
7051         if(!i && !next){ /* matrix not written, we use the preset one */
7052             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7053             break;
7054         }
7055         last = factors[scan[i]] = next ? next : last;
7056     }
7057 }
7058
7059 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7060                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7061     MpegEncContext * const s = &h->s;
7062     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7063     const uint8_t *fallback[4] = {
7064         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7065         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7066         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7067         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7068     };
7069     if(get_bits1(&s->gb)){
7070         sps->scaling_matrix_present |= is_sps;
7071         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7072         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7073         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7074         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7075         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7076         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7077         if(is_sps || pps->transform_8x8_mode){
7078             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7079             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7080         }
7081     }
7082 }
7083
7084 int ff_h264_decode_seq_parameter_set(H264Context *h){
7085     MpegEncContext * const s = &h->s;
7086     int profile_idc, level_idc;
7087     unsigned int sps_id;
7088     int i;
7089     SPS *sps;
7090
7091     profile_idc= get_bits(&s->gb, 8);
7092     get_bits1(&s->gb);   //constraint_set0_flag
7093     get_bits1(&s->gb);   //constraint_set1_flag
7094     get_bits1(&s->gb);   //constraint_set2_flag
7095     get_bits1(&s->gb);   //constraint_set3_flag
7096     get_bits(&s->gb, 4); // reserved
7097     level_idc= get_bits(&s->gb, 8);
7098     sps_id= get_ue_golomb_31(&s->gb);
7099
7100     if(sps_id >= MAX_SPS_COUNT) {
7101         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7102         return -1;
7103     }
7104     sps= av_mallocz(sizeof(SPS));
7105     if(sps == NULL)
7106         return -1;
7107
7108     sps->profile_idc= profile_idc;
7109     sps->level_idc= level_idc;
7110
7111     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7112     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7113     sps->scaling_matrix_present = 0;
7114
7115     if(sps->profile_idc >= 100){ //high profile
7116         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7117         if(sps->chroma_format_idc == 3)
7118             sps->residual_color_transform_flag = get_bits1(&s->gb);
7119         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7120         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7121         sps->transform_bypass = get_bits1(&s->gb);
7122         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7123     }else{
7124         sps->chroma_format_idc= 1;
7125     }
7126
7127     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7128     sps->poc_type= get_ue_golomb_31(&s->gb);
7129
7130     if(sps->poc_type == 0){ //FIXME #define
7131         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7132     } else if(sps->poc_type == 1){//FIXME #define
7133         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7134         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7135         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7136         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7137
7138         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7139             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7140             goto fail;
7141         }
7142
7143         for(i=0; i<sps->poc_cycle_length; i++)
7144             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7145     }else if(sps->poc_type != 2){
7146         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7147         goto fail;
7148     }
7149
7150     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7151     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7152         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7153         goto fail;
7154     }
7155     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7156     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7157     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7158     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7159        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7160         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7161         goto fail;
7162     }
7163
7164     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7165     if(!sps->frame_mbs_only_flag)
7166         sps->mb_aff= get_bits1(&s->gb);
7167     else
7168         sps->mb_aff= 0;
7169
7170     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7171
7172 #ifndef ALLOW_INTERLACE
7173     if(sps->mb_aff)
7174         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7175 #endif
7176     sps->crop= get_bits1(&s->gb);
7177     if(sps->crop){
7178         sps->crop_left  = get_ue_golomb(&s->gb);
7179         sps->crop_right = get_ue_golomb(&s->gb);
7180         sps->crop_top   = get_ue_golomb(&s->gb);
7181         sps->crop_bottom= get_ue_golomb(&s->gb);
7182         if(sps->crop_left || sps->crop_top){
7183             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7184         }
7185         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7186             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7187         }
7188     }else{
7189         sps->crop_left  =
7190         sps->crop_right =
7191         sps->crop_top   =
7192         sps->crop_bottom= 0;
7193     }
7194
7195     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7196     if( sps->vui_parameters_present_flag )
7197         decode_vui_parameters(h, sps);
7198
7199     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7200         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7201                sps_id, sps->profile_idc, sps->level_idc,
7202                sps->poc_type,
7203                sps->ref_frame_count,
7204                sps->mb_width, sps->mb_height,
7205                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7206                sps->direct_8x8_inference_flag ? "8B8" : "",
7207                sps->crop_left, sps->crop_right,
7208                sps->crop_top, sps->crop_bottom,
7209                sps->vui_parameters_present_flag ? "VUI" : "",
7210                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7211                );
7212     }
7213
7214     av_free(h->sps_buffers[sps_id]);
7215     h->sps_buffers[sps_id]= sps;
7216     h->sps = *sps;
7217     return 0;
7218 fail:
7219     av_free(sps);
7220     return -1;
7221 }
7222
7223 static void
7224 build_qp_table(PPS *pps, int t, int index)
7225 {
7226     int i;
7227     for(i = 0; i < 52; i++)
7228         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7229 }
7230
7231 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7232     MpegEncContext * const s = &h->s;
7233     unsigned int pps_id= get_ue_golomb(&s->gb);
7234     PPS *pps;
7235
7236     if(pps_id >= MAX_PPS_COUNT) {
7237         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7238         return -1;
7239     }
7240
7241     pps= av_mallocz(sizeof(PPS));
7242     if(pps == NULL)
7243         return -1;
7244     pps->sps_id= get_ue_golomb_31(&s->gb);
7245     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7246         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7247         goto fail;
7248     }
7249
7250     pps->cabac= get_bits1(&s->gb);
7251     pps->pic_order_present= get_bits1(&s->gb);
7252     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7253     if(pps->slice_group_count > 1 ){
7254         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7255         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7256         switch(pps->mb_slice_group_map_type){
7257         case 0:
7258 #if 0
7259 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7260 |    run_length[ i ]                                |1  |ue(v)   |
7261 #endif
7262             break;
7263         case 2:
7264 #if 0
7265 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7266 |{                                                  |   |        |
7267 |    top_left_mb[ i ]                               |1  |ue(v)   |
7268 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7269 |   }                                               |   |        |
7270 #endif
7271             break;
7272         case 3:
7273         case 4:
7274         case 5:
7275 #if 0
7276 |   slice_group_change_direction_flag               |1  |u(1)    |
7277 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7278 #endif
7279             break;
7280         case 6:
7281 #if 0
7282 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7283 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7284 |)                                                  |   |        |
7285 |    slice_group_id[ i ]                            |1  |u(v)    |
7286 #endif
7287             break;
7288         }
7289     }
7290     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7291     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7292     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7293         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7294         goto fail;
7295     }
7296
7297     pps->weighted_pred= get_bits1(&s->gb);
7298     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7299     pps->init_qp= get_se_golomb(&s->gb) + 26;
7300     pps->init_qs= get_se_golomb(&s->gb) + 26;
7301     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7302     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7303     pps->constrained_intra_pred= get_bits1(&s->gb);
7304     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7305
7306     pps->transform_8x8_mode= 0;
7307     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7308     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7309     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7310
7311     if(get_bits_count(&s->gb) < bit_length){
7312         pps->transform_8x8_mode= get_bits1(&s->gb);
7313         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7314         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7315     } else {
7316         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7317     }
7318
7319     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7320     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7321     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7322         h->pps.chroma_qp_diff= 1;
7323
7324     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7325         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7326                pps_id, pps->sps_id,
7327                pps->cabac ? "CABAC" : "CAVLC",
7328                pps->slice_group_count,
7329                pps->ref_count[0], pps->ref_count[1],
7330                pps->weighted_pred ? "weighted" : "",
7331                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7332                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7333                pps->constrained_intra_pred ? "CONSTR" : "",
7334                pps->redundant_pic_cnt_present ? "REDU" : "",
7335                pps->transform_8x8_mode ? "8x8DCT" : ""
7336                );
7337     }
7338
7339     av_free(h->pps_buffers[pps_id]);
7340     h->pps_buffers[pps_id]= pps;
7341     return 0;
7342 fail:
7343     av_free(pps);
7344     return -1;
7345 }
7346
7347 /**
7348  * Call decode_slice() for each context.
7349  *
7350  * @param h h264 master context
7351  * @param context_count number of contexts to execute
7352  */
7353 static void execute_decode_slices(H264Context *h, int context_count){
7354     MpegEncContext * const s = &h->s;
7355     AVCodecContext * const avctx= s->avctx;
7356     H264Context *hx;
7357     int i;
7358
7359     if (s->avctx->hwaccel)
7360         return;
7361     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7362         return;
7363     if(context_count == 1) {
7364         decode_slice(avctx, &h);
7365     } else {
7366         for(i = 1; i < context_count; i++) {
7367             hx = h->thread_context[i];
7368             hx->s.error_recognition = avctx->error_recognition;
7369             hx->s.error_count = 0;
7370         }
7371
7372         avctx->execute(avctx, (void *)decode_slice,
7373                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7374
7375         /* pull back stuff from slices to master context */
7376         hx = h->thread_context[context_count - 1];
7377         s->mb_x = hx->s.mb_x;
7378         s->mb_y = hx->s.mb_y;
7379         s->dropable = hx->s.dropable;
7380         s->picture_structure = hx->s.picture_structure;
7381         for(i = 1; i < context_count; i++)
7382             h->s.error_count += h->thread_context[i]->s.error_count;
7383     }
7384 }
7385
7386
7387 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7388     MpegEncContext * const s = &h->s;
7389     AVCodecContext * const avctx= s->avctx;
7390     int buf_index=0;
7391     H264Context *hx; ///< thread context
7392     int context_count = 0;
7393
7394     h->max_contexts = avctx->thread_count;
7395 #if 0
7396     int i;
7397     for(i=0; i<50; i++){
7398         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7399     }
7400 #endif
7401     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7402         h->current_slice = 0;
7403         if (!s->first_field)
7404             s->current_picture_ptr= NULL;
7405     }
7406
7407     for(;;){
7408         int consumed;
7409         int dst_length;
7410         int bit_length;
7411         const uint8_t *ptr;
7412         int i, nalsize = 0;
7413         int err;
7414
7415         if(h->is_avc) {
7416             if(buf_index >= buf_size) break;
7417             nalsize = 0;
7418             for(i = 0; i < h->nal_length_size; i++)
7419                 nalsize = (nalsize << 8) | buf[buf_index++];
7420             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7421                 if(nalsize == 1){
7422                     buf_index++;
7423                     continue;
7424                 }else{
7425                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7426                     break;
7427                 }
7428             }
7429         } else {
7430             // start code prefix search
7431             for(; buf_index + 3 < buf_size; buf_index++){
7432                 // This should always succeed in the first iteration.
7433                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7434                     break;
7435             }
7436
7437             if(buf_index+3 >= buf_size) break;
7438
7439             buf_index+=3;
7440         }
7441
7442         hx = h->thread_context[context_count];
7443
7444         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7445         if (ptr==NULL || dst_length < 0){
7446             return -1;
7447         }
7448         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7449             dst_length--;
7450         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7451
7452         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7453             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7454         }
7455
7456         if (h->is_avc && (nalsize != consumed)){
7457             int i, debug_level = AV_LOG_DEBUG;
7458             for (i = consumed; i < nalsize; i++)
7459                 if (buf[buf_index+i])
7460                     debug_level = AV_LOG_ERROR;
7461             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7462             consumed= nalsize;
7463         }
7464
7465         buf_index += consumed;
7466
7467         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7468            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7469             continue;
7470
7471       again:
7472         err = 0;
7473         switch(hx->nal_unit_type){
7474         case NAL_IDR_SLICE:
7475             if (h->nal_unit_type != NAL_IDR_SLICE) {
7476                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7477                 return -1;
7478             }
7479             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7480         case NAL_SLICE:
7481             init_get_bits(&hx->s.gb, ptr, bit_length);
7482             hx->intra_gb_ptr=
7483             hx->inter_gb_ptr= &hx->s.gb;
7484             hx->s.data_partitioning = 0;
7485
7486             if((err = decode_slice_header(hx, h)))
7487                break;
7488
7489             s->current_picture_ptr->key_frame |=
7490                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7491                     (h->sei_recovery_frame_cnt >= 0);
7492             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7493                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7494                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7495                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7496                && avctx->skip_frame < AVDISCARD_ALL){
7497                 if(avctx->hwaccel) {
7498                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7499                         return -1;
7500                 }else
7501                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7502                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7503                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7504                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7505                 }else
7506                     context_count++;
7507             }
7508             break;
7509         case NAL_DPA:
7510             init_get_bits(&hx->s.gb, ptr, bit_length);
7511             hx->intra_gb_ptr=
7512             hx->inter_gb_ptr= NULL;
7513             hx->s.data_partitioning = 1;
7514
7515             err = decode_slice_header(hx, h);
7516             break;
7517         case NAL_DPB:
7518             init_get_bits(&hx->intra_gb, ptr, bit_length);
7519             hx->intra_gb_ptr= &hx->intra_gb;
7520             break;
7521         case NAL_DPC:
7522             init_get_bits(&hx->inter_gb, ptr, bit_length);
7523             hx->inter_gb_ptr= &hx->inter_gb;
7524
7525             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7526                && s->context_initialized
7527                && s->hurry_up < 5
7528                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7529                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7530                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7531                && avctx->skip_frame < AVDISCARD_ALL)
7532                 context_count++;
7533             break;
7534         case NAL_SEI:
7535             init_get_bits(&s->gb, ptr, bit_length);
7536             ff_h264_decode_sei(h);
7537             break;
7538         case NAL_SPS:
7539             init_get_bits(&s->gb, ptr, bit_length);
7540             ff_h264_decode_seq_parameter_set(h);
7541
7542             if(s->flags& CODEC_FLAG_LOW_DELAY)
7543                 s->low_delay=1;
7544
7545             if(avctx->has_b_frames < 2)
7546                 avctx->has_b_frames= !s->low_delay;
7547             break;
7548         case NAL_PPS:
7549             init_get_bits(&s->gb, ptr, bit_length);
7550
7551             ff_h264_decode_picture_parameter_set(h, bit_length);
7552
7553             break;
7554         case NAL_AUD:
7555         case NAL_END_SEQUENCE:
7556         case NAL_END_STREAM:
7557         case NAL_FILLER_DATA:
7558         case NAL_SPS_EXT:
7559         case NAL_AUXILIARY_SLICE:
7560             break;
7561         default:
7562             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7563         }
7564
7565         if(context_count == h->max_contexts) {
7566             execute_decode_slices(h, context_count);
7567             context_count = 0;
7568         }
7569
7570         if (err < 0)
7571             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7572         else if(err == 1) {
7573             /* Slice could not be decoded in parallel mode, copy down
7574              * NAL unit stuff to context 0 and restart. Note that
7575              * rbsp_buffer is not transferred, but since we no longer
7576              * run in parallel mode this should not be an issue. */
7577             h->nal_unit_type = hx->nal_unit_type;
7578             h->nal_ref_idc   = hx->nal_ref_idc;
7579             hx = h;
7580             goto again;
7581         }
7582     }
7583     if(context_count)
7584         execute_decode_slices(h, context_count);
7585     return buf_index;
7586 }
7587
7588 /**
7589  * returns the number of bytes consumed for building the current frame
7590  */
7591 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7592         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7593         if(pos+10>buf_size) pos=buf_size; // oops ;)
7594
7595         return pos;
7596 }
7597
7598 static int decode_frame(AVCodecContext *avctx,
7599                              void *data, int *data_size,
7600                              const uint8_t *buf, int buf_size)
7601 {
7602     H264Context *h = avctx->priv_data;
7603     MpegEncContext *s = &h->s;
7604     AVFrame *pict = data;
7605     int buf_index;
7606
7607     s->flags= avctx->flags;
7608     s->flags2= avctx->flags2;
7609
7610    /* end of stream, output what is still in the buffers */
7611     if (buf_size == 0) {
7612         Picture *out;
7613         int i, out_idx;
7614
7615 //FIXME factorize this with the output code below
7616         out = h->delayed_pic[0];
7617         out_idx = 0;
7618         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7619             if(h->delayed_pic[i]->poc < out->poc){
7620                 out = h->delayed_pic[i];
7621                 out_idx = i;
7622             }
7623
7624         for(i=out_idx; h->delayed_pic[i]; i++)
7625             h->delayed_pic[i] = h->delayed_pic[i+1];
7626
7627         if(out){
7628             *data_size = sizeof(AVFrame);
7629             *pict= *(AVFrame*)out;
7630         }
7631
7632         return 0;
7633     }
7634
7635     if(h->is_avc && !h->got_avcC) {
7636         int i, cnt, nalsize;
7637         unsigned char *p = avctx->extradata;
7638         if(avctx->extradata_size < 7) {
7639             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7640             return -1;
7641         }
7642         if(*p != 1) {
7643             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7644             return -1;
7645         }
7646         /* sps and pps in the avcC always have length coded with 2 bytes,
7647            so put a fake nal_length_size = 2 while parsing them */
7648         h->nal_length_size = 2;
7649         // Decode sps from avcC
7650         cnt = *(p+5) & 0x1f; // Number of sps
7651         p += 6;
7652         for (i = 0; i < cnt; i++) {
7653             nalsize = AV_RB16(p) + 2;
7654             if(decode_nal_units(h, p, nalsize) < 0) {
7655                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7656                 return -1;
7657             }
7658             p += nalsize;
7659         }
7660         // Decode pps from avcC
7661         cnt = *(p++); // Number of pps
7662         for (i = 0; i < cnt; i++) {
7663             nalsize = AV_RB16(p) + 2;
7664             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7665                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7666                 return -1;
7667             }
7668             p += nalsize;
7669         }
7670         // Now store right nal length size, that will be use to parse all other nals
7671         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7672         // Do not reparse avcC
7673         h->got_avcC = 1;
7674     }
7675
7676     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7677         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7678             return -1;
7679         h->got_avcC = 1;
7680     }
7681
7682     buf_index=decode_nal_units(h, buf, buf_size);
7683     if(buf_index < 0)
7684         return -1;
7685
7686     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7687         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7688         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7689         return -1;
7690     }
7691
7692     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7693         Picture *out = s->current_picture_ptr;
7694         Picture *cur = s->current_picture_ptr;
7695         int i, pics, cross_idr, out_of_order, out_idx;
7696
7697         s->mb_y= 0;
7698
7699         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7700         s->current_picture_ptr->pict_type= s->pict_type;
7701
7702         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7703             ff_vdpau_h264_set_reference_frames(s);
7704
7705         if(!s->dropable) {
7706             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7707             h->prev_poc_msb= h->poc_msb;
7708             h->prev_poc_lsb= h->poc_lsb;
7709         }
7710         h->prev_frame_num_offset= h->frame_num_offset;
7711         h->prev_frame_num= h->frame_num;
7712
7713         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7714             ff_vdpau_h264_picture_complete(s);
7715
7716         /*
7717          * FIXME: Error handling code does not seem to support interlaced
7718          * when slices span multiple rows
7719          * The ff_er_add_slice calls don't work right for bottom
7720          * fields; they cause massive erroneous error concealing
7721          * Error marking covers both fields (top and bottom).
7722          * This causes a mismatched s->error_count
7723          * and a bad error table. Further, the error count goes to
7724          * INT_MAX when called for bottom field, because mb_y is
7725          * past end by one (callers fault) and resync_mb_y != 0
7726          * causes problems for the first MB line, too.
7727          */
7728         if (!FIELD_PICTURE)
7729             ff_er_frame_end(s);
7730
7731         MPV_frame_end(s);
7732         h->sei_recovery_frame_cnt = -1;
7733         h->sei_dpb_output_delay = 0;
7734         h->sei_cpb_removal_delay = -1;
7735         h->sei_buffering_period_present = 0;
7736
7737         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7738             /* Wait for second field. */
7739             *data_size = 0;
7740
7741         } else {
7742             cur->repeat_pict = 0;
7743
7744             /* Signal interlacing information externally. */
7745             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7746             if(h->sps.pic_struct_present_flag){
7747                 switch (h->sei_pic_struct)
7748                 {
7749                 case SEI_PIC_STRUCT_FRAME:
7750                     cur->interlaced_frame = 0;
7751                     break;
7752                 case SEI_PIC_STRUCT_TOP_FIELD:
7753                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7754                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7755                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7756                     cur->interlaced_frame = 1;
7757                     break;
7758                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7759                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7760                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7761                     // From these hints, let the applications decide if they apply deinterlacing.
7762                     cur->repeat_pict = 1;
7763                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7764                     break;
7765                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7766                     // Force progressive here, as doubling interlaced frame is a bad idea.
7767                     cur->interlaced_frame = 0;
7768                     cur->repeat_pict = 2;
7769                     break;
7770                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7771                     cur->interlaced_frame = 0;
7772                     cur->repeat_pict = 4;
7773                     break;
7774                 }
7775             }else{
7776                 /* Derive interlacing flag from used decoding process. */
7777                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7778             }
7779
7780             if (cur->field_poc[0] != cur->field_poc[1]){
7781                 /* Derive top_field_first from field pocs. */
7782                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7783             }else{
7784                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7785                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7786                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7787                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7788                         cur->top_field_first = 1;
7789                     else
7790                         cur->top_field_first = 0;
7791                 }else{
7792                     /* Most likely progressive */
7793                     cur->top_field_first = 0;
7794                 }
7795             }
7796
7797         //FIXME do something with unavailable reference frames
7798
7799             /* Sort B-frames into display order */
7800
7801             if(h->sps.bitstream_restriction_flag
7802                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7803                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7804                 s->low_delay = 0;
7805             }
7806
7807             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7808                && !h->sps.bitstream_restriction_flag){
7809                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7810                 s->low_delay= 0;
7811             }
7812
7813             pics = 0;
7814             while(h->delayed_pic[pics]) pics++;
7815
7816             assert(pics <= MAX_DELAYED_PIC_COUNT);
7817
7818             h->delayed_pic[pics++] = cur;
7819             if(cur->reference == 0)
7820                 cur->reference = DELAYED_PIC_REF;
7821
7822             out = h->delayed_pic[0];
7823             out_idx = 0;
7824             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7825                 if(h->delayed_pic[i]->poc < out->poc){
7826                     out = h->delayed_pic[i];
7827                     out_idx = i;
7828                 }
7829             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7830
7831             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7832
7833             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7834                 { }
7835             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7836                || (s->low_delay &&
7837                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7838                  || cur->pict_type == FF_B_TYPE)))
7839             {
7840                 s->low_delay = 0;
7841                 s->avctx->has_b_frames++;
7842             }
7843
7844             if(out_of_order || pics > s->avctx->has_b_frames){
7845                 out->reference &= ~DELAYED_PIC_REF;
7846                 for(i=out_idx; h->delayed_pic[i]; i++)
7847                     h->delayed_pic[i] = h->delayed_pic[i+1];
7848             }
7849             if(!out_of_order && pics > s->avctx->has_b_frames){
7850                 *data_size = sizeof(AVFrame);
7851
7852                 h->outputed_poc = out->poc;
7853                 *pict= *(AVFrame*)out;
7854             }else{
7855                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7856             }
7857         }
7858     }
7859
7860     assert(pict->data[0] || !*data_size);
7861     ff_print_debug_info(s, pict);
7862 //printf("out %d\n", (int)pict->data[0]);
7863 #if 0 //?
7864
7865     /* Return the Picture timestamp as the frame number */
7866     /* we subtract 1 because it is added on utils.c     */
7867     avctx->frame_number = s->picture_number - 1;
7868 #endif
7869     return get_consumed_bytes(s, buf_index, buf_size);
7870 }
7871 #if 0
/**
 * Disabled helper (compiled out by the enclosing "#if 0").
 *
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y). A neighbour
 * is flagged as available when its h->slice_table entry equals h->slice_num:
 *   [0] entry at mb_xy - mb_stride - 1 (previous row, previous column)
 *   [1] entry at mb_xy - mb_stride     (previous row, same column)
 *   [2] entry at mb_xy - mb_stride + 1 (previous row, next column)
 *   [3] entry at mb_xy - 1             (same row, previous column)
 *   [4] always 1, [5] always 0 (see the FIXMEs below)
 *
 * @param h decoder context; h->s supplies mb_x, mb_y, mb_stride and mb_width
 */
7872 static inline void fill_mb_avail(H264Context *h){
7873     MpegEncContext * const s = &h->s;
     /* linear index of the current macroblock in slice_table */
7874     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7875
     /* the three neighbours in the previous row only exist when mb_y != 0 */
7876     if(s->mb_y){
     /* [0] additionally needs mb_x != 0, [2] needs a column to the right */
7877         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7878         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7879         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7880     }else{
     /* first row: nothing above, so all three flags are cleared */
7881         h->mb_avail[0]=
7882         h->mb_avail[1]=
7883         h->mb_avail[2]= 0;
7884     }
     /* neighbour in the same row needs mb_x != 0 */
7885     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7886     h->mb_avail[4]= 1; //FIXME move out
7887     h->mb_avail[5]= 0; //FIXME move out
7888 }
7889 #endif
7890
7891 #ifdef TEST
7892 #undef printf
7893 #undef random
/* COUNT values are coded per test; SIZE is the scratch buffer size in bytes (40 per value). */
7894 #define COUNT 8000
7895 #define SIZE (COUNT*40)
/**
 * Self-test entry point, built only when TEST is defined.
 *
 * Writes COUNT values with set_ue_golomb() and reads them back with
 * get_ue_golomb(), then does the same with set_se_golomb()/get_se_golomb()
 * using the values i - COUNT/2. Each mismatch is printed; it does not abort
 * the run or change the return value (the "return -1" lines are commented
 * out). The 4x4 (I)DCT, quantizer and NAL layer tests further down are
 * compiled out by "#if 0".
 *
 * @return 0
 */
7896 int main(void){
7897     int i;
7898     uint8_t temp[SIZE];
7899     PutBitContext pb;
7900     GetBitContext gb;
7901 //    int int_temp[10000];
7902     DSPContext dsp;
7903     AVCodecContext avctx;
7904
     /* NOTE(review): avctx is passed here without being initialized in this
      * function -- confirm that dsputil_init() does not read from it. */
7905     dsputil_init(&dsp, &avctx);
7906
7907     init_put_bits(&pb, temp, SIZE);
7908     printf("testing unsigned exp golomb\n");
7909     for(i=0; i<COUNT; i++){
7910         START_TIMER
7911         set_ue_golomb(&pb, i);
7912         STOP_TIMER("set_ue_golomb");
7913     }
7914     flush_put_bits(&pb);
7915
7916     init_get_bits(&gb, temp, 8*SIZE);
7917     for(i=0; i<COUNT; i++){
7918         int j, s;
7919
         /* peek at the next 24 bits so they can be shown on a mismatch */
7920         s= show_bits(&gb, 24);
7921
7922         START_TIMER
7923         j= get_ue_golomb(&gb);
7924         if(j != i){
7925             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7926 //            return -1;
7927         }
7928         STOP_TIMER("get_ue_golomb");
7929     }
7930
7931
7932     init_put_bits(&pb, temp, SIZE);
7933     printf("testing signed exp golomb\n");
7934     for(i=0; i<COUNT; i++){
7935         START_TIMER
7936         set_se_golomb(&pb, i - COUNT/2);
7937         STOP_TIMER("set_se_golomb");
7938     }
7939     flush_put_bits(&pb);
7940
7941     init_get_bits(&gb, temp, 8*SIZE);
7942     for(i=0; i<COUNT; i++){
7943         int j, s;
7944
7945         s= show_bits(&gb, 24);
7946
7947         START_TIMER
7948         j= get_se_golomb(&gb);
7949         if(j != i - COUNT/2){
             /* report the value that was actually written (i - COUNT/2), not the loop index */
7950             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
7951 //            return -1;
7952         }
7953         STOP_TIMER("get_se_golomb");
7954     }
7955
     /* Everything up to the matching #endif is compiled out. */
7956 #if 0
7957     printf("testing 4x4 (I)DCT\n");
7958
7959     DCTELEM block[16];
7960     uint8_t src[16], ref[16];
7961     uint64_t error= 0, max_error=0;
7962
7963     for(i=0; i<COUNT; i++){
7964         int j;
7965 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7966         for(j=0; j<16; j++){
7967             ref[j]= random()%255;
7968             src[j]= random()%255;
7969         }
7970
7971         h264_diff_dct_c(block, src, ref, 4);
7972
7973         //normalize
7974         for(j=0; j<16; j++){
7975 //            printf("%d ", block[j]);
7976             block[j]= block[j]*4;
7977             if(j&1) block[j]= (block[j]*4 + 2)/5;
7978             if(j&4) block[j]= (block[j]*4 + 2)/5;
7979         }
7980 //        printf("\n");
7981
7982         s->dsp.h264_idct_add(ref, block, 4);
7983 /*        for(j=0; j<16; j++){
7984             printf("%d ", ref[j]);
7985         }
7986         printf("\n");*/
7987
7988         for(j=0; j<16; j++){
7989             int diff= FFABS(src[j] - ref[j]);
7990
7991             error+= diff*diff;
7992             max_error= FFMAX(max_error, diff);
7993         }
7994     }
7995     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7996     printf("testing quantizer\n");
7997     for(qp=0; qp<52; qp++){
7998         for(i=0; i<16; i++)
7999             src1_block[i]= src2_block[i]= random()%255;
8000
8001     }
8002     printf("Testing NAL layer\n");
8003
8004     uint8_t bitstream[COUNT];
8005     uint8_t nal[COUNT*2];
8006     H264Context h;
8007     memset(&h, 0, sizeof(H264Context));
8008
8009     for(i=0; i<COUNT; i++){
8010         int zeros= i;
8011         int nal_length;
8012         int consumed;
8013         int out_length;
8014         uint8_t *out;
8015         int j;
8016
8017         for(j=0; j<COUNT; j++){
8018             bitstream[j]= (random() % 255) + 1;
8019         }
8020
8021         for(j=0; j<zeros; j++){
8022             int pos= random() % COUNT;
8023             while(bitstream[pos] == 0){
8024                 pos++;
8025                 pos %= COUNT;
8026             }
8027             bitstream[pos]=0;
8028         }
8029
8030         START_TIMER
8031
8032         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8033         if(nal_length<0){
8034             printf("encoding failed\n");
8035             return -1;
8036         }
8037
8038         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8039
8040         STOP_TIMER("NAL")
8041
8042         if(out_length != COUNT){
8043             printf("incorrect length %d %d\n", out_length, COUNT);
8044             return -1;
8045         }
8046
8047         if(consumed != nal_length){
8048             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8049             return -1;
8050         }
8051
8052         if(memcmp(bitstream, out, COUNT)){
8053             printf("mismatch\n");
8054             return -1;
8055         }
8056     }
8057 #endif
8058
8059     printf("Testing RBSP\n");
8060
8061
8062     return 0;
8063 }
8064 #endif /* TEST */
8065
8066
/**
 * Releases the buffers held by the H.264 decoder context.
 *
 * Frees both h->rbsp_buffer entries, calls free_tables(), frees every entry
 * of h->sps_buffers and h->pps_buffers, then calls MPV_common_end() on the
 * embedded MpegEncContext. It is listed in the AVCodec initializers of this
 * file (presumably as the codec close callback -- confirm against the
 * AVCodec member order).
 *
 * @param avctx codec context whose priv_data is the H264Context to clean up
 * @return always 0
 */
8067 static av_cold int decode_end(AVCodecContext *avctx)
8068 {
8069     H264Context *h = avctx->priv_data;
8070     MpegEncContext *s = &h->s;
8071     int i;
8072
8073     av_freep(&h->rbsp_buffer[0]);
8074     av_freep(&h->rbsp_buffer[1]);
8075     free_tables(h); //FIXME cleanup init stuff perhaps
8076
     /* release every stored SPS slot, then every stored PPS slot */
8077     for(i = 0; i < MAX_SPS_COUNT; i++)
8078         av_freep(h->sps_buffers + i);
8079
8080     for(i = 0; i < MAX_PPS_COUNT; i++)
8081         av_freep(h->pps_buffers + i);
8082
8083     MPV_common_end(s);
8084
     /* the context itself is not cleared here; the memset below is disabled */
8085 //    memset(h, 0, sizeof(H264Context));
8086
8087     return 0;
8088 }
8089
8090
/**
 * Codec descriptor registered under the name "h264".
 *
 * The first nine initializers are positional, so their meaning depends on
 * the AVCodec member order, which is declared outside this file.
 * NOTE(review): the member names in the per-line comments are assumed from
 * that order -- confirm against avcodec.h.
 */
8091 AVCodec h264_decoder = {
8092     "h264",                 /* presumably name */
8093     CODEC_TYPE_VIDEO,       /* presumably type */
8094     CODEC_ID_H264,          /* presumably id */
8095     sizeof(H264Context),    /* presumably priv_data_size */
8096     decode_init,            /* presumably init */
8097     NULL,                   /* presumably encode (none) */
8098     decode_end,             /* presumably close */
8099     decode_frame,           /* presumably decode */
8100     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY, /* presumably capabilities; DRAW_HORIZ_BAND is left disabled */
8101     .flush= flush_dpb,
8102     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8103     .pix_fmts= ff_pixfmt_list_420,
8104 };
8105
8106 #if CONFIG_H264_VDPAU_DECODER
/**
 * Codec descriptor registered under the name "h264_vdpau"; only built when
 * CONFIG_H264_VDPAU_DECODER is non-zero.
 *
 * It uses the same decode_init/decode_end/decode_frame/flush_dpb functions
 * as h264_decoder, adds CODEC_CAP_HWACCEL_VDPAU to the capability flags and
 * does not set .pix_fmts.
 * NOTE(review): the positional initializers follow the AVCodec member
 * order declared outside this file -- confirm against avcodec.h.
 */
8107 AVCodec h264_vdpau_decoder = {
8108     "h264_vdpau",
8109     CODEC_TYPE_VIDEO,
8110     CODEC_ID_H264,
8111     sizeof(H264Context),
8112     decode_init,
8113     NULL,
8114     decode_end,
8115     decode_frame,
8116     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8117     .flush= flush_dpb,
8118     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8119 };
8120 #endif
8121
8122 #if CONFIG_SVQ3_DECODER
8123 #include "svq3.c"
8124 #endif